From 785bd887dfad210d957c7e17a5cb6294136e25e9 Mon Sep 17 00:00:00 2001 From: qiaojbao Date: Wed, 28 Aug 2024 18:14:14 +0800 Subject: [PATCH 1/4] Update llpc from commit 495b235c [Continuations] Refactor 'TypeLowering::lowerFunctionArguments' Fix performance gap caused by LDS bank conflict [CompilerUtils] Assert return types are not changed [CompilerUtils] Fixup alloca alignment in type lowering [llvmraytracing] Remove plugin pass info function for dynamically loaded plugins [CompilerUtils] Add LLVM plugin Remove deprecated assert Cleanup useless VgtGsVertItemsize calculation assignment [Continuations] Support '_AmdIsLlpc' intrinsic Remove deprecated interface forceEnablePrimStats Fix build failure due to llvm memory issue [Continuations] Introduce 'RCR' argument for 'lgc.cps.jump', refactor wait mask handling Add support for no return image sample intrinsics in builder [Continuations] Improve payload size reporting lgc : replace threadMaskedSelect function to use inverseballot [Continuations] Move 'lgc.ilcps.return' op building to 'LowerRaytracingPipeline' Compiler::buildShaderCacheHash: Fix fragment hash [Continuations] Improve stats reporting Force wave64 mode for legacy GS on GFX10 Change in/out semantics representation [Continuations] Remove payload size reporting in bytes Fix component addressing issue of 64-bit FS input lgc : fix logic issues of tryOptimizeWorkgroupId Implement the mapping from CLIP_DIST_4/5/6/7 to CLIP_DIST_0/1/2/3 accordingly [Continuations] Add 'complete' intrinsic, generate 'lgc.cps.jump' Disable AllowReassoc for reflect operations Correct processing of mesh shader output semantics [RT] Don't compile kernel entry for pipeline library lgc : fix global pointer access for buffer strided pointer [RT] Support AmdTraceRayInitStaticId Record debugprintf usermapping to the elf [Continuations] Add payload serialization alloca for Traversal in lgc.cps mode [Continuations] Use 'lgc.cps.complete' for program termination Rename some classes and files 
in lower pass Add strix1 support --- .clang-format | 8 +- .typos.toml | 2 +- compilerutils/CMakeLists.txt | 2 + .../include/compilerutils/CompilerUtils.h | 9 + .../include/compilerutils/DxilToLlvm.h | 78 + compilerutils/lib/CompilerUtils.cpp | 19 + compilerutils/lib/DxilToLlvm.cpp | 352 ++ compilerutils/lib/PassRegistry.inc | 43 + compilerutils/lib/TypeLowering.cpp | 39 +- compilerutils/plugin/CMakeLists.txt | 11 + compilerutils/plugin/Plugin.cpp | 40 + compilerutils/test/CMakeLists.txt | 2 +- .../test/dxil-to-llvm/simple-i1-vec.dxil | 116 + compilerutils/test/lit.cfg.py | 2 +- gfxruntime/src/shaders/AdvancedBlend.hlsl | 74 +- include/vkgcDefs.h | 21 +- lgc/CMakeLists.txt | 5 + lgc/builder/ArithBuilder.cpp | 9 + lgc/builder/BuilderImpl.cpp | 32 + lgc/builder/BuilderRecorder.cpp | 9 +- lgc/builder/ImageBuilder.cpp | 156 +- lgc/builder/InOutBuilder.cpp | 51 +- lgc/builder/MatrixBuilder.cpp | 27 + lgc/builder/SubgroupBuilder.cpp | 48 +- lgc/include/lgc/builder/BuilderImpl.h | 1 + lgc/include/lgc/patch/LowerDebugPrintf.h | 2 + lgc/include/lgc/patch/LowerGpuRt.h | 3 + lgc/include/lgc/patch/PatchBufferOp.h | 3 +- lgc/include/lgc/patch/PatchEntryPointMutate.h | 4 - .../lgc/patch/PatchInOutImportExport.h | 6 - lgc/include/lgc/patch/PatchResourceCollect.h | 1 + .../lgc/patch/WorkaroundDsSubdwordWrite.h | 59 + lgc/include/lgc/state/AbiMetadata.h | 2 +- lgc/include/lgc/state/PipelineState.h | 6 +- lgc/include/lgc/state/ResourceUsage.h | 24 +- lgc/include/lgc/util/WorkgroupLayout.h | 48 + lgc/interface/lgc/BuilderCommon.h | 3 + lgc/interface/lgc/BuiltIns.h | 6 +- lgc/interface/lgc/LgcDialect.h | 1 + lgc/interface/lgc/Pipeline.h | 23 +- lgc/patch/Continufy.cpp | 32 +- lgc/patch/LowerCooperativeMatrix.cpp | 68 +- lgc/patch/LowerDebugPrintf.cpp | 15 +- lgc/patch/LowerGpuRt.cpp | 12 + lgc/patch/LowerPopsInterlock.cpp | 387 ++ lgc/patch/LowerPopsInterlock.h | 68 + lgc/patch/MeshTaskShader.cpp | 96 +- lgc/patch/MeshTaskShader.h | 4 +- lgc/patch/NggPrimShader.cpp | 9 +- 
lgc/patch/PassRegistry.inc | 4 + lgc/patch/Patch.cpp | 13 +- lgc/patch/PatchBufferOp.cpp | 51 +- lgc/patch/PatchEntryPointMutate.cpp | 199 - lgc/patch/PatchInOutImportExport.cpp | 215 +- lgc/patch/PatchResourceCollect.cpp | 52 +- lgc/patch/RegisterMetadataBuilder.cpp | 188 +- lgc/patch/ShaderInputs.cpp | 111 +- lgc/patch/ShaderMerger.cpp | 13 +- lgc/patch/SystemValues.cpp | 10 +- lgc/patch/WorkaroundDsSubdwordWrite.cpp | 101 + lgc/state/PipelineState.cpp | 11 +- lgc/state/TargetInfo.cpp | 28 + lgc/test/ImageSampleNoReturn.lgc | 45 + lgc/test/Transforms/Continufy/simple.lgc | 6 +- .../LowerCooperativeMatrix/bf16muladd.lgc | 31 + .../LowerCooperativeMatrix/convert.lgc | 17 + .../Transforms/LowerDebugPrintf/basic.lgc | 1 + .../LowerGpuRt/init-static-id-op.lgc | 20 + lgc/test/WorkgroupIdOpt.lgc | 187 + .../gfx1150_ds_subdword_workaround.lgc | 85 + lgc/tool/lgc/lgc.cpp | 4 +- lgc/util/WorkgroupLayout.cpp | 211 ++ llpc/CMakeLists.txt | 28 +- llpc/context/llpcCompiler.cpp | 51 +- llpc/context/llpcContext.cpp | 35 +- llpc/context/llpcGraphicsContext.cpp | 3 + llpc/include/llpc.h | 1 + ...erAccessChain.cpp => LowerAccessChain.cpp} | 24 +- ...vLowerAccessChain.h => LowerAccessChain.h} | 10 +- llpc/lower/LowerAdvancedBlend.cpp | 65 +- llpc/lower/LowerAdvancedBlend.h | 3 +- ...vLowerCfgMerges.cpp => LowerCfgMerges.cpp} | 16 +- ...SpirvLowerCfgMerges.h => LowerCfgMerges.h} | 8 +- ...Store.cpp => LowerConstImmediateStore.cpp} | 4 +- ...iateStore.h => LowerConstImmediateStore.h} | 2 +- ...eMatrix.cpp => LowerCooperativeMatrix.cpp} | 6 +- ...ativeMatrix.h => LowerCooperativeMatrix.h} | 4 +- llpc/lower/LowerGLCompatibility.cpp | 154 +- llpc/lower/LowerGLCompatibility.h | 2 + ...cSpirvLowerGlobal.cpp => LowerGlobals.cpp} | 98 +- ...{llpcSpirvLowerGlobal.h => LowerGlobals.h} | 8 +- ...MetaRemove.cpp => LowerInstMetaRemove.cpp} | 4 +- ...InstMetaRemove.h => LowerInstMetaRemove.h} | 2 +- ....cpp => LowerInternalLibraryIntrinsic.cpp} | 6 +- ...Util.h => LowerInternalLibraryIntrinsic.h} 
| 4 +- .../{llpcSpirvLowerMath.cpp => LowerMath.cpp} | 4 +- .../{llpcSpirvLowerMath.h => LowerMath.h} | 4 +- ...irvLowerMemoryOp.cpp => LowerMemoryOp.cpp} | 32 +- ...pcSpirvLowerMemoryOp.h => LowerMemoryOp.h} | 10 +- ...owerRayTracing.cpp => LowerRayTracing.cpp} | 6 +- ...irvLowerRayTracing.h => LowerRayTracing.h} | 4 +- ...owerTerminator.cpp => LowerTerminator.cpp} | 4 +- ...irvLowerTerminator.h => LowerTerminator.h} | 4 +- ...owerTranslator.cpp => LowerTranslator.cpp} | 4 +- ...irvLowerTranslator.h => LowerTranslator.h} | 2 +- llpc/lower/PassRegistry.inc | 8 +- llpc/lower/PrepareContinuations.cpp | 2 +- llpc/lower/ProcessGfxRuntimeLibrary.cpp | 145 +- llpc/lower/ProcessGfxRuntimeLibrary.h | 7 +- ...uRtLibrary.cpp => ProcessGpuRtLibrary.cpp} | 256 +- ...ssGpuRtLibrary.h => ProcessGpuRtLibrary.h} | 14 +- llpc/lower/llpcSpirvLower.cpp | 37 +- .../shaderdb/core/OpGroupNonUniformMax.comp | 123 +- .../OpExtInst_TestReflectFloat_lit.frag | 5 + ...tNumComponentsWithReversedAccessOrder.mesh | 2 +- .../shaderdb/general/UndefVertexOutput.spvasm | 10 +- .../shaderdb/gfx11/SgprUserDataInit_Fs.pipe | 2 - .../ray_tracing/TestKnownRayFlags.pipe | 108 + .../ray_tracing/TestKnownRayFlags.rgen | 30 - .../PipelineGs_TestOutputLocations.pipe | 69 - .../PipelineVsFs_FillPsInput.pipe | 23 +- ...elineVsFs_TestRelocatableInOutMapping.pipe | 11 +- llpc/tool/amdllpc.cpp | 1 + llpc/translator/lib/SPIRV/SPIRVReader.cpp | 38 +- llpc/translator/lib/SPIRV/SPIRVReader.h | 3 +- llpc/translator/lib/SPIRV/hex_float.h | 43 + .../lib/SPIRV/libSPIRV/SPIRVModule.cpp | 6 +- .../lib/SPIRV/libSPIRV/SPIRVModule.h | 2 +- .../lib/SPIRV/libSPIRV/SPIRVType.cpp | 4 + .../translator/lib/SPIRV/libSPIRV/SPIRVType.h | 13 +- llpc/util/llpcShaderModuleHelper.cpp | 1 + llvmraytracing/include/lgc/GpurtDialect.td | 6 + llvmraytracing/include/lgc/LgcCpsDialect.td | 15 +- llvmraytracing/include/lgc/LgcIlCpsDialect.td | 18 - llvmraytracing/include/lgc/LgcRtDialect.h | 3 + .../llvmraytracing/ContinuationsUtil.h | 25 +- 
llvmraytracing/lib/CleanupContinuations.cpp | 51 +- llvmraytracing/lib/Continuations.cpp | 71 +- llvmraytracing/lib/ContinuationsLint.cpp | 19 +- .../lib/ContinuationsStatsReport.cpp | 106 +- .../lib/DXILContIntrinsicPrepare.cpp | 18 +- llvmraytracing/lib/DXILContPostProcess.cpp | 97 +- .../lib/LegacyCleanupContinuations.cpp | 107 +- llvmraytracing/lib/LgcCpsJumpInliner.cpp | 2 + llvmraytracing/lib/LgcRtDialect.cpp | 7 + llvmraytracing/lib/LowerAwait.cpp | 45 +- llvmraytracing/lib/LowerRayQuery.cpp | 9 +- .../lib/LowerRaytracingPipeline.cpp | 293 +- llvmraytracing/plugin/Plugin.cpp | 6 - .../test/dx/cleanup-continuations-malloc.ll | 12 +- .../test/dx/cleanup-continuations.ll | 27 +- .../test/dx/closest-hit-procedural.ll | 6 +- .../test/dx/closest-hit-traceray.ll | 18 +- llvmraytracing/test/dx/closest-hit.ll | 4 +- .../test/dx/continuation-registercount.ll | 57 +- llvmraytracing/test/dx/continuation-state.ll | 10 +- .../test/dx/continuation-without-await.ll | 12 +- .../dx/dxil-cont-convert-lgc-rt-op-trace.ll | 14 +- .../test/dx/dxil-cont-intrinsic-prepare.ll | 5 +- .../test/dx/dxil-cont-post-process.ll | 1 - .../test/dx/dxil-cont-prepare-traversal.ll | 10 +- .../test/dx/dxil-cps-stack-lowering-global.ll | 2 +- .../dx/dxil-cps-stack-lowering-scratch.ll | 2 +- .../test/dx/inline-const-jump-target.ll | 6 +- .../test/dx/intersection-registercount.ll | 4 +- llvmraytracing/test/dx/intrinsics/complete.ll | 75 + .../cont-payload-registers-get-i32.ll | 11 +- .../cont-payload-registers-i32-count.ll | 3 +- .../cont-payload-registers-set-i32.ll | 11 +- .../dx/intrinsics/get-current-func-addr.ll | 4 +- llvmraytracing/test/dx/intrinsics/get-rtip.ll | 2 +- .../test/dx/intrinsics/get-shader-kind.ll | 4 +- llvmraytracing/test/dx/intrinsics/is-llpc.ll | 26 + .../test/dx/intrinsics/shader-index.ll | 7 +- .../test/dx/intrinsics/value-i32.ll | 2 +- .../test/dx/lint/undef-jump-target.ll | 4 +- llvmraytracing/test/dx/lower-await.ll | 56 +- .../test/dx/lower-rt-pipeline-call-shader.ll | 10 
+- .../test/dx/lower-rt-pipeline-exit-raygen.ll | 3 +- .../dx/lower-rt-pipeline-intrinsics-hit.ll | 31 +- .../test/dx/lower-rt-pipeline-intrinsics.ll | 42 +- .../dx/lower-rt-pipeline-large-payload.ll | 68 +- .../lower-rt-pipeline-simple-call-shader.ll | 4 +- .../lower-rt-pipeline-small-payload-field.ll | 4 +- llvmraytracing/test/dx/lower-rt-pipeline.ll | 88 +- .../test/dx/paq-hit-attribute-size.ll | 40 +- .../test/dx/payload-caller-in-paq.ll | 17 +- .../test/dx/payload-save-registers.ll | 32 +- llvmraytracing/test/dx/payload.ll | 3331 ++++++++++++++++- llvmraytracing/test/dx/remat-intrinsic.ll | 18 +- llvmraytracing/test/dx/stats-report-sizes.ll | 14 +- llvmraytracing/test/dx/traceray.ll | 174 +- .../test/dx/traversal-empty-payload.ll | 23 +- .../test/dx/traversal-passthrough-payload.ll | 21 +- .../test/dx/unnamed-type-intrinsics.ll | 15 +- .../test/intrinsics/get-func-addr.ll | 5 +- .../test/intrinsics/shader-start.ll | 2 +- .../test/lgccps/entry-point-with-cps.ll | 12 +- .../cont-payload-registers-get-i32.ll | 50 + .../cont-payload-registers-i32-count.ll | 52 + .../cont-payload-registers-set-i32.ll | 46 + llvmraytracing/test/lgccps/lower-traversal.ll | 38 +- tool/dumper/vkgcPipelineDumper.cpp | 15 +- tool/vfx/vfxVkSection.h | 7 +- util/gpurtshim/GpurtShim.cpp | 2 +- version/CMakeLists.txt | 12 + version/include/llpc/GpurtIntrinsics.h | 3 + version/include/llpcVersion.h.in | 6 +- 208 files changed, 8328 insertions(+), 2366 deletions(-) create mode 100644 compilerutils/include/compilerutils/DxilToLlvm.h create mode 100644 compilerutils/lib/DxilToLlvm.cpp create mode 100644 compilerutils/lib/PassRegistry.inc create mode 100644 compilerutils/plugin/CMakeLists.txt create mode 100644 compilerutils/plugin/Plugin.cpp create mode 100644 compilerutils/test/dxil-to-llvm/simple-i1-vec.dxil create mode 100644 lgc/include/lgc/patch/WorkaroundDsSubdwordWrite.h create mode 100644 lgc/include/lgc/util/WorkgroupLayout.h create mode 100644 lgc/patch/LowerPopsInterlock.cpp create 
mode 100644 lgc/patch/LowerPopsInterlock.h create mode 100644 lgc/patch/WorkaroundDsSubdwordWrite.cpp create mode 100644 lgc/test/ImageSampleNoReturn.lgc create mode 100644 lgc/test/Transforms/LowerCooperativeMatrix/bf16muladd.lgc create mode 100644 lgc/test/Transforms/LowerGpuRt/init-static-id-op.lgc create mode 100644 lgc/test/WorkgroupIdOpt.lgc create mode 100644 lgc/test/shaderdb/gfx1150_ds_subdword_workaround.lgc create mode 100644 lgc/util/WorkgroupLayout.cpp rename llpc/lower/{llpcSpirvLowerAccessChain.cpp => LowerAccessChain.cpp} (93%) rename llpc/lower/{llpcSpirvLowerAccessChain.h => LowerAccessChain.h} (91%) rename llpc/lower/{llpcSpirvLowerCfgMerges.cpp => LowerCfgMerges.cpp} (98%) rename llpc/lower/{llpcSpirvLowerCfgMerges.h => LowerCfgMerges.h} (88%) rename llpc/lower/{llpcSpirvLowerConstImmediateStore.cpp => LowerConstImmediateStore.cpp} (99%) rename llpc/lower/{llpcSpirvLowerConstImmediateStore.h => LowerConstImmediateStore.h} (98%) rename llpc/lower/{llpcSpirvLowerCooperativeMatrix.cpp => LowerCooperativeMatrix.cpp} (97%) rename llpc/lower/{llpcSpirvLowerCooperativeMatrix.h => LowerCooperativeMatrix.h} (94%) rename llpc/lower/{llpcSpirvLowerGlobal.cpp => LowerGlobals.cpp} (96%) rename llpc/lower/{llpcSpirvLowerGlobal.h => LowerGlobals.h} (97%) rename llpc/lower/{llpcSpirvLowerInstMetaRemove.cpp => LowerInstMetaRemove.cpp} (97%) rename llpc/lower/{llpcSpirvLowerInstMetaRemove.h => LowerInstMetaRemove.h} (98%) rename llpc/lower/{llpcSpirvLowerInternalLibraryIntrinsicUtil.cpp => LowerInternalLibraryIntrinsic.cpp} (98%) rename llpc/lower/{llpcSpirvLowerInternalLibraryIntrinsicUtil.h => LowerInternalLibraryIntrinsic.h} (94%) rename llpc/lower/{llpcSpirvLowerMath.cpp => LowerMath.cpp} (99%) rename llpc/lower/{llpcSpirvLowerMath.h => LowerMath.h} (97%) rename llpc/lower/{llpcSpirvLowerMemoryOp.cpp => LowerMemoryOp.cpp} (92%) rename llpc/lower/{llpcSpirvLowerMemoryOp.h => LowerMemoryOp.h} (93%) rename llpc/lower/{llpcSpirvLowerRayTracing.cpp => 
LowerRayTracing.cpp} (99%) rename llpc/lower/{llpcSpirvLowerRayTracing.h => LowerRayTracing.h} (99%) rename llpc/lower/{llpcSpirvLowerTerminator.cpp => LowerTerminator.cpp} (98%) rename llpc/lower/{llpcSpirvLowerTerminator.h => LowerTerminator.h} (95%) rename llpc/lower/{llpcSpirvLowerTranslator.cpp => LowerTranslator.cpp} (98%) rename llpc/lower/{llpcSpirvLowerTranslator.h => LowerTranslator.h} (98%) rename llpc/lower/{llpcSpirvProcessGpuRtLibrary.cpp => ProcessGpuRtLibrary.cpp} (82%) rename llpc/lower/{llpcSpirvProcessGpuRtLibrary.h => ProcessGpuRtLibrary.h} (91%) create mode 100644 llpc/test/shaderdb/ray_tracing/TestKnownRayFlags.pipe delete mode 100644 llpc/test/shaderdb/ray_tracing/TestKnownRayFlags.rgen delete mode 100644 llpc/test/shaderdb/relocatable_shaders/PipelineGs_TestOutputLocations.pipe create mode 100644 llvmraytracing/test/dx/intrinsics/complete.ll create mode 100644 llvmraytracing/test/dx/intrinsics/is-llpc.ll create mode 100644 llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-get-i32.ll create mode 100644 llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll create mode 100644 llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-set-i32.ll diff --git a/.clang-format b/.clang-format index 6a1adea01e..354660d163 100644 --- a/.clang-format +++ b/.clang-format @@ -4,10 +4,12 @@ AllowShortFunctionsOnASingleLine: InlineOnly IncludeBlocks: Merge IncludeCategories: - Regex: '^"lgc/' - Priority: 2 - - Regex: '^"(llvm|llvm-c|llvm-dialects|clang|clang-c)/' Priority: 3 - - Regex: '^(<|"(gtest|gmock|isl|json)/)' + - Regex: '^"(llvm|llvm-c|llvm-dialects|clang|clang-c)/' Priority: 4 + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 5 + - Regex: '.*/' + Priority: 2 - Regex: '.*' Priority: 1 diff --git a/.typos.toml b/.typos.toml index 7776161986..d94ecdccc3 100644 --- a/.typos.toml +++ b/.typos.toml @@ -13,6 +13,7 @@ UE = "UE" OpBuildNDRange = "OpBuildNDRange" serDataNode = "serDataNode" rcall = "rcall" +Fo = "Fo" 
[default.extend-words] ba = "ba" @@ -28,4 +29,3 @@ dne = "dne" offen = "offen" varing = "varing" Derivate = "Derivate" -Fo = "Fo" diff --git a/compilerutils/CMakeLists.txt b/compilerutils/CMakeLists.txt index 1aae6b3b59..4aa8824093 100644 --- a/compilerutils/CMakeLists.txt +++ b/compilerutils/CMakeLists.txt @@ -14,6 +14,7 @@ endfunction() add_llvm_library(LLVMCompilerUtils lib/ArgPromotion.cpp lib/CompilerUtils.cpp + lib/DxilToLlvm.cpp lib/TypeLowering.cpp lib/TypesMetadata.cpp @@ -39,5 +40,6 @@ set_compiler_options(LLVMCompilerUtils) target_compile_features(LLVMCompilerUtils PUBLIC cxx_std_17) set_target_properties(LLVMCompilerUtils PROPERTIES CXX_EXTENSIONS OFF) +add_subdirectory(plugin) add_subdirectory(tool/cross-module-inline) add_subdirectory(test) diff --git a/compilerutils/include/compilerutils/CompilerUtils.h b/compilerutils/include/compilerutils/CompilerUtils.h index 745d6adb5d..e273499d47 100644 --- a/compilerutils/include/compilerutils/CompilerUtils.h +++ b/compilerutils/include/compilerutils/CompilerUtils.h @@ -38,8 +38,17 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/IRBuilder.h" +namespace llvm { + +class PassBuilder; + +} // namespace llvm + namespace CompilerUtils { +// Register compiler utils passes. +void RegisterPasses(llvm::PassBuilder &PB); + // Create an LLVM function call to the named function. The callee is built // automatically based on return type and its parameters. // diff --git a/compilerutils/include/compilerutils/DxilToLlvm.h b/compilerutils/include/compilerutils/DxilToLlvm.h new file mode 100644 index 0000000000..346f1d19d9 --- /dev/null +++ b/compilerutils/include/compilerutils/DxilToLlvm.h @@ -0,0 +1,78 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +//===- DxilToLlvm.h - --------------------------------------------------------------------------------------------===// +// +// This pass converts a DXIL module into an LLVM module by fixing constructs that have different semantics in the two. +// The output module will still contain DXIL intrinsics and metadata, because we only fix incompatibilities, and don't +// lower away DXIL. +// +// The following modifications are made: +// +// * i1 vectors are replaced by i32 vectors +// This works around a more general difference between DXIL and LLVM: +// In LLVM, vectors are always bit-packed and ignore the elements' alignment. +// In DXIL, vectors respect the elements' alignment, and i1s have 32-bit alignment. 
+// Thus, in DXIL, the elements of <2 x i1> are 32 bits apart, while they are bit-packed in LLVM, +// and DXC relies on this by bit-casting allocas between <2 x i1> and <2 x i32>. +// This only seems to affect HLSL i1 *matrices*, which are lowered to arrays of i1 vectors in DXIL, +// and not HLSL i1 vectors, which are lowered to i32 arrays in DXIL. +// To fix this, we replace all i1 vectors by i32 vectors. +// We don't apply the same to other vectors that were overaligned in the original DXIL data layout (e.g. i16) +// because this may harm performance, and we haven't observed cases yet where DXC relies on this layout. +// See https://github.com/microsoft/DirectXShaderCompiler/issues/6082 for some background. +// +// Further known, not yet handled differences: +// +// * vectors of non-i1 elements that are overaligned in DXIL (see above) +// * potentially: overaligned types in general +// After importing DXIL modules, we change the data layout to match what the backend does. Doing so potentially +// breaks the module if it relies on the existing DL. For instance, after changing the alignment of i16 from 32 to 16, +// storing as [4 x i16] and reading back the second dword behaves differently. Strictly speaking, when changing the +// DL, we would need to update such occurrences. We don't do that because we haven't yet observed such cases, and +// because it is difficult in general. For instance, we could transparently replace i16s by i32s to preserve the +// 32-bit size, but replacing half by float is more problematic. Although there is no spec, it appears DXC tries to +// emit DXIL that supports such DL changes, by only using structured GEPs and avoiding transformations based on byte +// offsets. Also, such fixups are only possible locally (e.g. for allocas), and not through opaque memory. 
+// * UDiv/URem/FPTrunc differences +// * fast math flags +// +//===--------------------------------------------------------------------------------------------------------------===// + +#pragma once + +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" + +namespace CompilerUtils { + +class DxilToLlvmPass : public llvm::PassInfoMixin { +public: + llvm::PreservedAnalyses run(llvm::Module &Module, llvm::ModuleAnalysisManager &AnalysisManager); + + static llvm::StringRef name() { return "Convert DXIL to LLVM IR"; } +}; + +} // namespace CompilerUtils diff --git a/compilerutils/lib/CompilerUtils.cpp b/compilerutils/lib/CompilerUtils.cpp index 9cbc790228..e454db298a 100644 --- a/compilerutils/lib/CompilerUtils.cpp +++ b/compilerutils/lib/CompilerUtils.cpp @@ -24,6 +24,7 @@ **********************************************************************************************************************/ #include "compilerutils/CompilerUtils.h" +#include "compilerutils/DxilToLlvm.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/iterator_range.h" #include "llvm/IR/Attributes.h" @@ -32,6 +33,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/Passes/PassBuilder.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -544,3 +546,20 @@ void CompilerUtils::replaceAllPointerUses(IRBuilder<> *builder, Value *oldPointe assert(PhiElems.empty() && "All phi inputs need to be handled, otherwise we end in an inconsistent state"); #endif } + +void CompilerUtils::RegisterPasses(llvm::PassBuilder &PB) { +#define HANDLE_PASS(NAME, CREATE_PASS) \ + if (innerPipeline.empty() && name == NAME) { \ + passMgr.addPass(CREATE_PASS); \ + return true; \ + } + + PB.registerPipelineParsingCallback( + [](StringRef name, ModulePassManager &passMgr, ArrayRef innerPipeline) { + StringRef Params; + (void)Params; +#define COMPILERUTILS_PASS HANDLE_PASS +#include 
"PassRegistry.inc" + return false; + }); +} diff --git a/compilerutils/lib/DxilToLlvm.cpp b/compilerutils/lib/DxilToLlvm.cpp new file mode 100644 index 0000000000..47afe786ab --- /dev/null +++ b/compilerutils/lib/DxilToLlvm.cpp @@ -0,0 +1,352 @@ +//===- DxilToLlvm.cpp - Convert DXIL to LLVM IR. -===// +// +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + **********************************************************************************************************************/ +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "compilerutils/DxilToLlvm.h" +#include "compilerutils/TypeLowering.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; +using namespace CompilerUtils; + +#define DEBUG_TYPE "dxil-to-llvm" + +namespace { + +// Applies value replacements to values in metadata (ValueAsMetadata). +// Metadata values can be replaced in-place without the need to +// construct new objects. This simplifies the algorithm: +// We just traverse all reachable metadata nodes and update them on the fly. +// TODO: Can this be done more efficiently, e.g. 
by using ValueAsMetadata in LLVMContextImpl::ValuesAsMetadata? +// This would need LLVM changes though to somehow expose that. +class MetadataUpdater { +public: + MetadataUpdater(Module &module, TypeLowering &typeLower) : m_module{module}, m_typeLower{typeLower} {} + + void run() { + processNamedMetadata(); + processUnnamedMetadata(); + } + +private: + void processNamedMetadata() { + for (NamedMDNode &mdNode : m_module.named_metadata()) { + for (auto node : mdNode.operands()) + processNode(node); + } + } + + void processUnnamedMetadata() { + SmallVector> collectedNodes; + auto processCollectedNodes = [&]() { + for (auto [_, mdNode] : collectedNodes) + processNode(mdNode); + collectedNodes.clear(); + }; + + for (GlobalVariable &global : m_module.globals()) { + global.getAllMetadata(collectedNodes); + processCollectedNodes(); + } + + for (Function &func : m_module) { + func.getAllMetadata(collectedNodes); + processCollectedNodes(); + for (BasicBlock &bb : func) { + for (Instruction &inst : bb) { + inst.getAllMetadata(collectedNodes); + processCollectedNodes(); + } + } + } + } + + // Performs type and value replacements on the given ValueAsMetadata node + void processValueMd(ValueAsMetadata *valueMd) { + Value *oldValue = valueMd->getValue(); + + auto types = m_typeLower.convertType(oldValue->getType()); + assert(types.size() == 1); + if (types[0] == oldValue->getType()) + return; + + Value *newValue = m_typeLower.getValue(oldValue)[0]; + if (newValue != oldValue) { + valueMd->handleRAUW(oldValue, newValue); + LLVM_DEBUG(dbgs() << "Replaced " << *oldValue << " by " << newValue << "\n"); + } else { + LLVM_DEBUG(dbgs() << "Kept value " << *oldValue << "\n"); + } + } + + // For the given node and all reachable nodes (via operands): + // Replace ValueAsMetadata values according to the stored type lowering object. + // Ignores nodes that have already been processed. 
+ void processNode(MDNode *node) { + // Adds a node to the worklist if it hasn't been seen before + auto addToWorklist = [this](MDNode *node) { + bool inserted = m_processed.insert(node).second; + if (inserted) + m_worklist.push_back(node); + }; + + addToWorklist(node); + + while (!m_worklist.empty()) { + MDNode *curNode = m_worklist.pop_back_val(); + const auto *mdTuple = cast(curNode); + for (const MDOperand &operand : mdTuple->operands()) { + Metadata *md = operand.get(); + if (!md) + continue; + if (auto *operandMdNode = dyn_cast(md)) + addToWorklist(operandMdNode); + else if (auto *valueMd = dyn_cast(md)) + processValueMd(valueMd); + else + assert(isa(md)); + } + continue; + } + } + + Module &m_module; + TypeLowering &m_typeLower; + // Bookkeeping data structures for pending and processed nodes. + // These are only used in processNode() + SmallVector m_worklist; + DenseSet m_processed; +}; + +struct DxilToLlvmPassImpl { + DxilToLlvmPassImpl(Module &module) : m_module(module), m_typeLower(module.getContext()) {} + + // Given a type used as element type of a vector, return the replacement type to be used in vectors. + static Type *convertVectorElementType(Type *elemTy) { + // For now, just replace i1 vectors as DXC is known to rely on the layout of i1 vectors + // using pointer bitcasts. + if (elemTy->isIntegerTy(1)) + return Type::getInt32Ty(elemTy->getContext()); + return nullptr; + } + + static SmallVector convertVectorType(TypeLowering &typeLower, Type *ty) { + VectorType *vTy = dyn_cast(ty); + if (!vTy) + return {}; + assert(!vTy->isScalableTy()); + + Type *elemTy = vTy->getElementType(); + Type *convertedElemTy = convertVectorElementType(elemTy); + if (!convertedElemTy) + return {}; + + assert(convertedElemTy != elemTy); + return {VectorType::get(convertedElemTy, vTy->getElementCount())}; + } + + // Wrapper around TypeLowering::convertType unpacking the vector. 
+ Type *getConvertedType(Type *ty) { + auto types = m_typeLower.convertType(ty); + assert(types.size() == 1); + return types[0]; + } + + // Given a value of a converted type that has already been handled, obtain the replaced value. + // Wrapper around TypeLowering::getValue unpacking the vector. + Value *getConvertedValue(Value *value) { + auto convertedValue = m_typeLower.getValue(value); + assert(convertedValue.size() == 1); + return convertedValue[0]; + } + + // Given an integer value that is replaced if part of a vector, create a value + // of the replacement vector element type that can be used in the replaced + // vector value. + // The resulting value is a zext of the original value. + Value *convertIntegerValue(IRBuilder<> &builder, Value *integerValue, Type *targetTy) { + Type *origTy = integerValue->getType(); + assert(targetTy && targetTy != origTy); + if ([[maybe_unused]] IntegerType *origIntegerTy = dyn_cast(origTy)) { + IntegerType *convertedIntegerTy = cast(targetTy); + assert(convertedIntegerTy->getBitWidth() >= origIntegerTy->getBitWidth()); + return builder.CreateZExt(integerValue, convertedIntegerTy); + } + llvm_unreachable("unsupported type"); + } + + // Given a converted integer value, restore a value of the original integer type. + // Assumes the original type bit width is smaller, and creates a trunc. 
+ Value *restoreIntegerValue(IRBuilder<> &builder, Value *convertedValue, Type *origTy) { + if ([[maybe_unused]] IntegerType *origIntegerTy = dyn_cast(origTy)) { + assert(origIntegerTy->getBitWidth() <= convertedValue->getType()->getIntegerBitWidth()); + return builder.CreateTrunc(convertedValue, origTy); + } + llvm_unreachable("unsupported type"); + } + + // ; %vec is a value of a vector type that is replaced + // ; %val is a value that is replaced within vectors + // %vec.inserted = insertelement <2 x i1> %vec, i1 %val, i32 %idx + // + // ---> + // + // %val.zext = zext i1 %val to i32 + // %vec.inserted.translated = insertelement <2 x i32> %vec.translated, i32 %val.zext, i32 %idx + void visitInsertElement(llvm::InsertElementInst &insertElement) { + Value *element = insertElement.getOperand(1); + if (convertVectorElementType(element->getType()) == nullptr) + return; + + IRBuilder<> builder(&insertElement); + Value *inputVector = insertElement.getOperand(0); + Value *index = insertElement.getOperand(2); + + auto convertedInputVector = getConvertedValue(inputVector); + VectorType *convertedVectorTy = cast(convertedInputVector->getType()); + auto replacedElement = convertIntegerValue(builder, element, convertedVectorTy->getElementType()); + + auto *replacement = + builder.CreateInsertElement(convertedInputVector, replacedElement, index, insertElement.getName()); + m_typeLower.replaceInstruction(&insertElement, replacement); + } + + // ; %vec is a value of a vector type that is replaced + // %val = extractelement <2 x i1> %vec, i32 %idx + // + // ---> + // + // %val.tmp = extractelement <2 x i32> %vec.translated, i32 %idx + // %val.translated = trunc i32 %val.tmp to i1 + void visitExtractElement(llvm::ExtractElementInst &extractElement) { + Value *inputVector = extractElement.getOperand(0); + Type *elementTy = cast(inputVector->getType())->getElementType(); + if (convertVectorElementType(elementTy) == nullptr) + return; + + Value *index = extractElement.getOperand(1); 
+ + auto convertedInputVector = getConvertedValue(inputVector); + + IRBuilder<> builder(&extractElement); + auto *convertedExtract = builder.CreateExtractElement(convertedInputVector, index, extractElement.getName()); + + // Don't need to record any mapping, as the result type is a scalar which isn't replaced, so a RAUW is all we need. + auto restoredElement = restoreIntegerValue(builder, convertedExtract, extractElement.getType()); + extractElement.replaceAllUsesWith(restoredElement); + m_typeLower.eraseInstruction(&extractElement); + } + + void visitGEP(llvm::GetElementPtrInst &gepInst) { + Type *oldTy = gepInst.getSourceElementType(); + Type *newTy = getConvertedType(oldTy); + // We intentionally only replace the GEP source type, and do not + // update indices accordingly. In cases where the new type has + // a different layout, this changes the GEP offset in the LLVM interpretation. + // This is intended: The old and new GEPs have the same offset in the DXIL model, + // which also equals the new GEPs offset in the LLVM model. + if (newTy == oldTy) + return; + + IRBuilder<> builder(&gepInst); + + // Type lowering may have changed pointer values, e.g. by creating a new alloca of matching type, + // so we need to check replacement values. 
+ Value *pointerOperand = gepInst.getPointerOperand(); + auto convertedPointerOperand = m_typeLower.getValueOptional(pointerOperand); + if (!convertedPointerOperand.empty()) + pointerOperand = convertedPointerOperand[0]; + + SmallVector indexList(gepInst.indices()); + auto *convertedGep = builder.CreateGEP(newTy, pointerOperand, indexList, gepInst.getName(), gepInst.isInBounds()); + + gepInst.replaceAllUsesWith(convertedGep); + m_typeLower.eraseInstruction(&gepInst); + } + + void fixFunctionTypes() { + for (Function &function : m_module) + m_typeLower.lowerFunctionArguments(function); + } + + llvm::PreservedAnalyses run() { + m_typeLower.addRule(convertVectorType); + + static const auto visitor = llvm_dialects::VisitorBuilder() + .nest(&TypeLowering::registerVisitors) + .add(&DxilToLlvmPassImpl::visitInsertElement) + .add(&DxilToLlvmPassImpl::visitExtractElement) + .add(&DxilToLlvmPassImpl::visitGEP) + .build(); + fixFunctionTypes(); + + visitor.visit(*this, m_module); + + m_typeLower.finishPhis(); + m_typeLower.finishCleanup(); + + MetadataUpdater mdUpdater(m_module, m_typeLower); + mdUpdater.run(); + + return PreservedAnalyses::none(); + } + Module &m_module; + TypeLowering m_typeLower; +}; + +} // anonymous namespace + +template <> struct llvm_dialects::VisitorPayloadProjection { + static TypeLowering &project(DxilToLlvmPassImpl &p) { return p.m_typeLower; } +}; + +llvm::PreservedAnalyses CompilerUtils::DxilToLlvmPass::run(llvm::Module &module, llvm::ModuleAnalysisManager &) { + DxilToLlvmPassImpl Impl{module}; + return Impl.run(); +} diff --git a/compilerutils/lib/PassRegistry.inc b/compilerutils/lib/PassRegistry.inc new file mode 100644 index 0000000000..13b599016a --- /dev/null +++ b/compilerutils/lib/PassRegistry.inc @@ -0,0 +1,43 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file PassRegistry.inc + * @brief LLPC header file: used as the registry of LLPC lowering passes + *********************************************************************************************************************** + */ + +#ifndef COMPILERUTILS_PASS +#define COMPILERUTILS_PASS(NAME, CREATE_PASS) +#endif + +#ifndef COMPILERUTILS_MODULE_PASS +#define COMPILERUTILS_MODULE_PASS COMPILERUTILS_PASS +#endif + +COMPILERUTILS_MODULE_PASS("dxil-to-llvm", DxilToLlvmPass()) + +#undef COMPILERUTILS_PASS +#undef COMPILERUTILS_MODULE_PASS diff --git a/compilerutils/lib/TypeLowering.cpp b/compilerutils/lib/TypeLowering.cpp index 0346c9794b..b02060f702 100644 --- a/compilerutils/lib/TypeLowering.cpp +++ b/compilerutils/lib/TypeLowering.cpp @@ -25,6 +25,7 @@ #include "compilerutils/TypeLowering.h" #include "compilerutils/CompilerUtils.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instructions.h" @@ -131,18 +132,32 @@ TypeLowering::TypeLowering(LLVMContext &context) : m_builder(context) { Function *TypeLowering::lowerFunctionArguments(Function &fn) { SmallVector newArgTys; SmallVector remappedArgs; - for (size_t argIdx = 0; argIdx < fn.arg_size(); ++argIdx) { - auto *arg = fn.getArg(argIdx); - auto converted = convertType(arg->getType()); + + // Process arguments + for (const auto &[index, arg] : llvm::enumerate(fn.args())) { + auto converted = convertType(arg.getType()); assert(converted.size() == 1 && "Only 1:1 type remapping supported now"); - if (converted[0] == arg->getType()) { - newArgTys.push_back(arg->getType()); + if (converted[0] == arg.getType()) { + newArgTys.push_back(arg.getType()); } else { - remappedArgs.push_back(argIdx); + remappedArgs.push_back(index); 
newArgTys.push_back(converted[0]); } } + // Changing return types is currently not supported. + // If a use case arises, we need to be a bit careful: + // We need to first lower function arguments (recording the argument replacement values), + // then process function bodies (recording replacements of SSA values), + // and then fixup ret statements. + // Because invalid IR is temporarily allowed, we should be able to mutate + // function types just once, also changing the return type here, + // and registering cleanup work to fix rets later. + // + // For now, just assert that return types don't need to change. + assert(convertType(fn.getReturnType()) == ArrayRef{fn.getReturnType()} && + "Return type mapping not supported."); + if (remappedArgs.empty()) return &fn; @@ -153,12 +168,10 @@ Function *TypeLowering::lowerFunctionArguments(Function &fn) { // Setup names and replace argument uses except the remapped ones. // The remapped argument will be handled by later instruction visitor. - for (unsigned idx = 0; idx < newFn->arg_size(); idx++) { - Value *oldArg = fn.getArg(idx); - Value *newArg = newFn->getArg(idx); - newArg->setName(oldArg->getName()); - if (!llvm::is_contained(remappedArgs, idx)) - oldArg->replaceAllUsesWith(newArg); + for (const auto &[index, oldArg, newArg] : enumerate(fn.args(), newFn->args())) { + newArg.setName(oldArg.getName()); + if (!llvm::is_contained(remappedArgs, index)) + oldArg.replaceAllUsesWith(&newArg); } m_functionsToErase.push_back(&fn); return newFn; @@ -435,6 +448,8 @@ void TypeLowering::visitAlloca(AllocaInst &alloca) { } else { alloca.setAllocatedType(StructType::get(m_builder.getContext(), types)); } + const DataLayout &DL = alloca.getFunction()->getDataLayout(); + alloca.setAlignment(DL.getPrefTypeAlign(alloca.getAllocatedType())); } // ===================================================================================================================== diff --git a/compilerutils/plugin/CMakeLists.txt 
b/compilerutils/plugin/CMakeLists.txt new file mode 100644 index 0000000000..b6dad1be9a --- /dev/null +++ b/compilerutils/plugin/CMakeLists.txt @@ -0,0 +1,11 @@ +set(LLVM_COMPILERUTILSPLUGIN_LINK_INTO_TOOLS ON CACHE BOOL "Link plugin into tools" FORCE) + +add_llvm_pass_plugin(CompilerUtilsPlugin + Plugin.cpp + + LINK_COMPONENTS + Support +) + +target_link_libraries(CompilerUtilsPlugin PRIVATE LLVMCompilerUtils) +set_compiler_options(CompilerUtilsPlugin) diff --git a/compilerutils/plugin/Plugin.cpp b/compilerutils/plugin/Plugin.cpp new file mode 100644 index 0000000000..9bee3b45ce --- /dev/null +++ b/compilerutils/plugin/Plugin.cpp @@ -0,0 +1,40 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + **********************************************************************************************************************/ + +//===- Plugin.cpp - LLVM plugin for compilerutils passes ------------------===// +// +// Register compilerutils passes, so they can be used from opt. +// +//===----------------------------------------------------------------------===// + +#include "compilerutils/CompilerUtils.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Passes/PassPlugin.h" + +// New PM registration +llvm::PassPluginLibraryInfo getCompilerUtilsPluginPluginInfo() { + return {LLVM_PLUGIN_API_VERSION, "CompilerUtils", LLVM_VERSION_STRING, + [](llvm::PassBuilder &PB) { CompilerUtils::RegisterPasses(PB); }}; +} diff --git a/compilerutils/test/CMakeLists.txt b/compilerutils/test/CMakeLists.txt index 4de3e114be..8211af2112 100644 --- a/compilerutils/test/CMakeLists.txt +++ b/compilerutils/test/CMakeLists.txt @@ -1,4 +1,4 @@ -set(COMPILERUTILS_TEST_DEPENDS cross-module-inline FileCheck count not) +set(COMPILERUTILS_TEST_DEPENDS cross-module-inline FileCheck count not opt) add_custom_target(compilerutils-test-depends DEPENDS ${COMPILERUTILS_TEST_DEPENDS}) set_target_properties(compilerutils-test-depends PROPERTIES FOLDER "Tests") diff --git a/compilerutils/test/dxil-to-llvm/simple-i1-vec.dxil b/compilerutils/test/dxil-to-llvm/simple-i1-vec.dxil new file mode 100644 index 0000000000..3eae8bf5ab --- /dev/null +++ b/compilerutils/test/dxil-to-llvm/simple-i1-vec.dxil @@ -0,0 +1,116 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals +; RUN: opt -passes="dxil-to-llvm,lint" --verify-each --lint-abort-on-error -S %s | FileCheck %s +target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +declare void @use32(i32) +declare i32 @def32() +declare void @use1(i1) +declare i1 @def1() + +%simple.struct = type { <2 x i1> } +; Check that 
<2 x i1> is replaced by <2 x i32> in the struct: +; CHECK-NOT: %simple.struct = type { <2 x i1> } +; CHECK: %simple.struct.0 = type { <2 x i32> } +; CHECK-NOT: %simple.struct = type { <2 x i1> } + +define void @test_vec_alloca() { +; CHECK-LABEL: define {{[^@]+}}@test_vec_alloca() { +; CHECK-NEXT: [[VEC_ALLOCA:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT: [[I1_1:%.*]] = call i1 @def1() +; CHECK-NEXT: [[I1_2:%.*]] = call i1 @def1() +; CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[I1_1]] to i32 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <2 x i32> undef, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[I1_2]] to i32 +; CHECK-NEXT: [[VEC_12:%.*]] = insertelement <2 x i32> [[VEC1]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: store <2 x i32> [[VEC_12]], ptr [[VEC_ALLOCA]], align 8 +; CHECK-NEXT: [[VEC_I32_LOAD:%.*]] = load i32, ptr [[VEC_ALLOCA]], align 4 +; CHECK-NEXT: call void @use32(i32 [[VEC_I32_LOAD]]) +; CHECK-NEXT: ret void +; + %vec.alloca = alloca <2 x i1>, align 1 + %i1.1 = call i1 @def1() + %i1.2 = call i1 @def1() + %vec = insertelement <2 x i1> undef, i1 %i1.1, i32 0 + %vec.1 = insertelement <2 x i1> %vec, i1 %i1.2, i32 1 + store <2 x i1> %vec.1, ptr %vec.alloca + %vec.i32.load = load i32, ptr %vec.alloca, align 4 + call void @use32(i32 %vec.i32.load) + ret void +} + + +define void @test_vec_struct_alloca() { +; CHECK-LABEL: define {{[^@]+}}@test_vec_struct_alloca() { +; CHECK-NEXT: [[VEC_ALLOCA:%.*]] = alloca [[SIMPLE_STRUCT_0:%.*]], align 8 +; CHECK-NEXT: [[I1_1:%.*]] = call i1 @def1() +; CHECK-NEXT: [[I1_2:%.*]] = call i1 @def1() +; CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[I1_1]] to i32 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <2 x i32> undef, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[I1_2]] to i32 +; CHECK-NEXT: [[VEC_12:%.*]] = insertelement <2 x i32> [[VEC1]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[STRUCT:%.*]] = insertvalue [[SIMPLE_STRUCT_0]] poison, <2 x i32> [[VEC_12]], 0 +; CHECK-NEXT: store [[SIMPLE_STRUCT_0]] [[STRUCT]], ptr 
[[VEC_ALLOCA]], align 8 +; CHECK-NEXT: [[VEC_I32_LOAD:%.*]] = load i32, ptr [[VEC_ALLOCA]], align 4 +; CHECK-NEXT: call void @use32(i32 [[VEC_I32_LOAD]]) +; CHECK-NEXT: ret void +; + %vec.alloca = alloca %simple.struct, align 1 + %i1.1 = call i1 @def1() + %i1.2 = call i1 @def1() + %vec = insertelement <2 x i1> undef, i1 %i1.1, i32 0 + %vec.1 = insertelement <2 x i1> %vec, i1 %i1.2, i32 1 + %struct = insertvalue %simple.struct poison, <2 x i1> %vec.1, 0 + store %simple.struct %struct, ptr %vec.alloca + %vec.i32.load = load i32, ptr %vec.alloca, align 4 + call void @use32(i32 %vec.i32.load) + ret void +} + +; Only check mutating function arguments. Mutating return types is not yet supported and isn't required for now. +define i1 @test_argument(<7 x i1> %arg) { +; CHECK-LABEL: define {{[^@]+}}@test_argument +; CHECK-SAME: (<7 x i32> [[ARG:%.*]]) { +; CHECK-NEXT: [[VAL1:%.*]] = extractelement <7 x i32> [[ARG]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[VAL1]] to i1 +; CHECK-NEXT: ret i1 [[TMP1]] +; + %val = extractelement <7 x i1> %arg, i32 3 + ret i1 %val +} + +define i1 @test_struct_gep(ptr %arg, i32 %index) { +; CHECK-LABEL: define {{[^@]+}}@test_struct_gep +; CHECK-SAME: (ptr [[ARG:%.*]], i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[PTR1:%.*]] = getelementptr [[SIMPLE_STRUCT_0:%.*]], ptr [[ARG]], i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[VEC:%.*]] = load <2 x i32>, ptr [[PTR1]], align 8 +; CHECK-NEXT: [[RES2:%.*]] = extractelement <2 x i32> [[VEC]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[RES2]] to i1 +; CHECK-NEXT: ret i1 [[TMP1]] +; + %ptr = getelementptr %simple.struct, ptr %arg, i32 %index, i32 0 + %vec = load <2 x i1>, ptr %ptr + %res = extractelement <2 x i1> %vec, i32 1 + ret i1 %res +} + +define void @test_pointee_metadata(<7 x i1>, ptr) !types !1 { +; CHECK-LABEL: define {{[^@]+}}@test_pointee_metadata +; CHECK-SAME: (<7 x i32> [[TMP0:%.*]], ptr [[TMP1:%.*]]) !types [[META2:![0-9]+]] { +; CHECK-NEXT: ret void +; + ret void +} + +!named = !{!10, !11} + 
+!1 = !{!"function", !"void", <7 x i1> poison, !2} +!2 = !{i32 0, %simple.struct poison} +!10 = !{<1 x i1> undef} +!11 = !{<3 x i1> poison} +;. +; CHECK: [[META0:![0-9]+]] = !{<1 x i32> undef} +; CHECK: [[META1:![0-9]+]] = !{<3 x i32> poison} +; CHECK: [[META2]] = !{!"function", !"void", <7 x i32> poison, [[META3:![0-9]+]]} +; CHECK: [[META3]] = !{i32 0, %simple.struct.0 poison} +;. diff --git a/compilerutils/test/lit.cfg.py b/compilerutils/test/lit.cfg.py index 93dad026a2..69460eb705 100644 --- a/compilerutils/test/lit.cfg.py +++ b/compilerutils/test/lit.cfg.py @@ -22,7 +22,7 @@ # suffixes: A list of file extensions to treat as test files. This is overridden # by individual lit.local.cfg files in the test subdirectories. -config.suffixes = ['.ll'] +config.suffixes = ['.ll', '.dxil'] # excludes: A list of directories to exclude from the testsuite. config.excludes = ['inc'] diff --git a/gfxruntime/src/shaders/AdvancedBlend.hlsl b/gfxruntime/src/shaders/AdvancedBlend.hlsl index 5bff6c24cc..e142a629f5 100644 --- a/gfxruntime/src/shaders/AdvancedBlend.hlsl +++ b/gfxruntime/src/shaders/AdvancedBlend.hlsl @@ -50,13 +50,14 @@ float4 AmdExtFragCoord() DUMMY_FLOAT4_FUNC int AmdExtSampleId() DUMMY_INT_FUNC float4 AmdAdvancedBlendTexelLoad(int64_t imageDesc, int2 iCoord, int lod) DUMMY_FLOAT4_FUNC -float4 AmdAdvancedBlendTexelLoadFmask(int64_t imageDesc, int64_t fmaskDesc, int2 iCoord, int lod) DUMMY_FLOAT4_FUNC +float4 AmdAdvancedBlendTexelLoadMsaa(int64_t imageDesc, int64_t fmaskDesc, int2 iCoord, int sampleId) DUMMY_FLOAT4_FUNC -float4 AmdAdvancedBlendCoherentTexelLoad(float4 color, int2 iCoord, int sampleId) DUMMY_FLOAT4_FUNC -void AmdAdvancedBlendCoherentTexelStore(float4 color, int2 iCoord, int sampleId) DUMMY_VOID_FUNC - // clang-format on +float4 AmdAdvancedBlendCoherentTexelLoad(int64_t desc, int2 iCoord, int sampleId) DUMMY_FLOAT4_FUNC +void AmdAdvancedBlendCoherentTexelStore(float4 texel, int64_t desc, int2 iCoord, int sampleId) DUMMY_VOID_FUNC + +float4 
AmdAdvancedBlendCoherentTexelLoadMsaa(int64_t desc, int2 iCoord, int sampleId) DUMMY_FLOAT4_FUNC +void AmdAdvancedBlendCoherentTexelStoreMsaa(float4 texel, int64_t desc, int2 iCoord, int sampleId) DUMMY_VOID_FUNC - // clang-format off enum BlendEquationEnum { Multiply = 1, Screen, @@ -224,24 +225,7 @@ float AmdAdvancedBlendDivide(float dividend, float divisor) { } } -export float4 AmdAdvancedBlendInternal(float4 inColor, int64_t imageDescMs, int64_t imageDesc, int64_t fmaskDesc, - int mode, bool isMsaa) { - float4 srcColor = inColor; - if (mode == 0) { - return srcColor; - } - float4 fragCoord = AmdExtFragCoord(); - int2 iCoord = int2(fragCoord.x, fragCoord.y); - float4 dstColor; - if (isMsaa) { - dstColor = AmdAdvancedBlendTexelLoadFmask(imageDescMs, fmaskDesc, iCoord, 0); - } else { - dstColor = AmdAdvancedBlendTexelLoad(imageDesc, iCoord, 0); - } - // TODO: Uncomment them once ROV is support in LLPC - // int sampleId = AmdExtSampleId(); - // dstColor = AmdAdvancedBlendCoherentTexelLoad(dstColor, iCoord, sampleId); - +float4 AmdAdvancedBlending(int mode, float4 srcColor, float4 dstColor) { if (srcColor.a == 0.0f) { srcColor.r = 0.0f; srcColor.g = 0.0f; @@ -262,7 +246,7 @@ export float4 AmdAdvancedBlendInternal(float4 inColor, int64_t imageDescMs, int6 } float p0 = srcColor.a * dstColor.a; float p1 = srcColor.a * (1.0f - dstColor.a); - float p2 = (1.0f - srcColor.a) * dstColor.a; + float p2 = dstColor.a * (1.0f - srcColor.a); float4 blendingOutput; blendingOutput.r = (srcColor.r * p1) + (dstColor.r * p2); @@ -348,7 +332,47 @@ export float4 AmdAdvancedBlendInternal(float4 inColor, int64_t imageDescMs, int6 blendingOutput.r += tempColor.r * p0; blendingOutput.g += tempColor.g * p0; blendingOutput.b += tempColor.b * p0; - // AmdAdvancedBlendCoherentTexelStore(blendingOutput, iCoord, sampleId); + + return blendingOutput; +} + +export float4 AmdAdvancedBlendInternal(float4 inColor, int64_t imageDescMs, int64_t imageDesc, int64_t fmaskDesc, + int mode, int isMsaa) { + 
if (mode == 0) { + return inColor; + } + float4 srcColor = inColor; + float4 fragCoord = AmdExtFragCoord(); + int2 iCoord = int2(fragCoord.x, fragCoord.y); + float4 dstColor; + if (isMsaa == 1) + dstColor = AmdAdvancedBlendTexelLoadMsaa(imageDescMs, fmaskDesc, iCoord, 0); + else + dstColor = AmdAdvancedBlendTexelLoad(imageDesc, iCoord, 0); + + float4 blendingOutput = AmdAdvancedBlending(mode, srcColor, dstColor); + return blendingOutput; +} + +export float4 AmdAdvancedBlendInternalRov(float4 inColor, int64_t rovDesc, int mode, int isMsaa) { + if (mode == 0) { + return inColor; + } + + float4 fragCoord = AmdExtFragCoord(); + int2 iCoord = int2(fragCoord.x, fragCoord.y); + float4 blendingOutput; + if (isMsaa == 1) { + int sampleId = AmdExtSampleId(); + float4 dstColor = AmdAdvancedBlendCoherentTexelLoadMsaa(rovDesc, iCoord, sampleId); + blendingOutput = AmdAdvancedBlending(mode, inColor, dstColor); + AmdAdvancedBlendCoherentTexelStoreMsaa(blendingOutput, rovDesc, iCoord, sampleId); + } else { + float4 dstColor = AmdAdvancedBlendCoherentTexelLoad(rovDesc, iCoord, 0); + blendingOutput = AmdAdvancedBlending(mode, inColor, dstColor); + AmdAdvancedBlendCoherentTexelStore(blendingOutput, rovDesc, iCoord, 0); + } + return blendingOutput; } diff --git a/include/vkgcDefs.h b/include/vkgcDefs.h index 044f96fd3c..d1bfd86971 100644 --- a/include/vkgcDefs.h +++ b/include/vkgcDefs.h @@ -228,7 +228,8 @@ enum InternalBinding : unsigned { ReverseThreadGroupControlBinding = 7, ///< Binding ID of internal buffer for reverseThreadGroup RtCaptureReplayInternalBufferBinding = 8, ///< Binding ID of ray-tracing capture replay internal buffer PixelOpInternalBinding = 9, ///< Binding ID of pixel operand image buffer. - SpecConstInternalBufferBindingId = 10, ///< Binding ID of internal buffer for specialized constant. + AdvancedBlendInternalBinding = 10, ///< Binding ID of advanced blending coherent. 
+ SpecConstInternalBufferBindingId = 11, ///< Binding ID of internal buffer for specialized constant. SpecConstInternalBufferBindingIdEnd = SpecConstInternalBufferBindingId + ShaderStageCount, ConstantBuffer0Binding = 24, ///< Binding ID of default uniform block ConstantBuffer0BindingEnd = ConstantBuffer0Binding + ShaderStageGfxCount, @@ -696,6 +697,18 @@ enum class NggSubgroupSizingType : unsigned { /// primsPerSubgroup }; +/// Represents alpha test function +enum AlphaTestFunc : uint32_t { + Always, ///< GL_ALWAYS + Never, ///< GL_NEVER + Less, ///< GL_LESS + LEqual, ///< GL_LEQUAL + Equal, ///< GL_EQUAL + GEqual, ///< GL_GEQUAL + Greater, ///< GL_GREATER + NotEqual, ///< GL_NOTEQUAL +}; + /// Represents NGG tuning options struct NggState { bool enableNgg; ///< Enable NGG mode, use an implicit primitive shader @@ -1214,9 +1227,6 @@ struct ApiXfbOutData { XfbOutInfo *pXfbOutInfos; ///< An array of XfbOutInfo items unsigned numXfbOutInfo; ///< Count of XfbOutInfo items bool forceDisableStreamOut; ///< Force to disable stream out XFB outputs -#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 70 - bool forceEnablePrimStats; ///< Force to enable counting generated primitives -#endif }; /// Represents the tessellation level passed from driver API @@ -1227,6 +1237,7 @@ struct TessellationLevel { struct AdvancedBlendInfo { bool enableAdvancedBlend; ///< Whether enable advanced blending + bool enableRov; ///< Whether enable advanced blending with raster-order-view support unsigned binding; ///< The binding point of the texture resource attached to the framebuffer }; @@ -1342,6 +1353,8 @@ struct GraphicsPipelineBuildInfo { bool enableFlatShade; ///< Whether enable flat shade. float lineSmooth[4]; ///< Line smooth pattern float pointSmooth[2]; ///< Point smooth pattern + bool enableMapClipDistMask; ///< Whether to remap the clip distances. 
+ AlphaTestFunc alphaTestFunc; ///< AlphaTestFunc type } glState; const auto &getGlState() const { return glState; } #endif diff --git a/lgc/CMakeLists.txt b/lgc/CMakeLists.txt index 10080aac5c..cf3417aaae 100644 --- a/lgc/CMakeLists.txt +++ b/lgc/CMakeLists.txt @@ -143,6 +143,7 @@ target_sources(LLVMlgc PRIVATE patch/FragColorExport.cpp patch/LowerDebugPrintf.cpp patch/LowerDesc.cpp + patch/LowerPopsInterlock.cpp patch/LowerSubgroupOps.cpp patch/MeshTaskShader.cpp patch/NggPrimShader.cpp @@ -173,6 +174,9 @@ target_sources(LLVMlgc PRIVATE patch/VertexFetch.cpp patch/PatchImageOpCollect.cpp patch/RegisterMetadataBuilder.cpp +#if VKI_BUILD_STRIX1 + patch/WorkaroundDsSubdwordWrite.cpp +#endif patch/CombineCooperativeMatrix.cpp patch/LowerCooperativeMatrix.cpp patch/LowerGpuRt.cpp @@ -207,6 +211,7 @@ target_sources(LLVMlgc PRIVATE util/ModuleBunch.cpp util/PassManager.cpp util/StartStopTimer.cpp + util/WorkgroupLayout.cpp ) # lgc/interface/lgc diff --git a/lgc/builder/ArithBuilder.cpp b/lgc/builder/ArithBuilder.cpp index fee6b1a564..a9c307efe8 100644 --- a/lgc/builder/ArithBuilder.cpp +++ b/lgc/builder/ArithBuilder.cpp @@ -91,6 +91,8 @@ Value *BuilderImpl::CreateCubeFaceIndex(Value *coord, const Twine &instName) { // @param instName : Name to give instruction(s) Value *BuilderImpl::CreateFpTruncWithRounding(Value *value, Type *destTy, RoundingMode roundingMode, const Twine &instName) { + assert(!destTy->getScalarType()->isBFloatTy() && "HW doesn't support roundingMode instrunctions for BFloat16"); + if (value->getType()->getScalarType()->isDoubleTy()) value = CreateFPTrunc(value, BuilderBase::getConditionallyVectorizedTy(getFloatTy(), destTy)); @@ -869,6 +871,13 @@ Value *BuilderImpl::CreateFaceForward(Value *n, Value *i, Value *nref, const Twi // @param n : Input value "N" // @param instName : Name to give instruction(s) Value *BuilderImpl::CreateReflect(Value *i, Value *n, const Twine &instName) { + // The reflect function is defined as: reflect(I, N) = I - 2 * 
dot(I, N) * N + // For scalars this is: reflect(I, N) = I - 2 * (I * N) * N + // Applying reassociation could transform (I * N) * N into I * (N * N) + // which can cause spurious overflow or underflow if N is very large or very small. + // Prevent this by disabling reassociation. + getFastMathFlags().setAllowReassoc(false); + Value *dot = CreateDotProduct(n, i); dot = CreateFMul(dot, ConstantFP::get(dot->getType(), 2.0)); if (auto vecTy = dyn_cast(n->getType())) diff --git a/lgc/builder/BuilderImpl.cpp b/lgc/builder/BuilderImpl.cpp index 0177fcd2be..088f655ed2 100644 --- a/lgc/builder/BuilderImpl.cpp +++ b/lgc/builder/BuilderImpl.cpp @@ -74,6 +74,38 @@ Type *BuilderBase::getConditionallyVectorizedTy(Type *elementTy, Type *maybeVecT // @param vector2 : The float vector 2 // @param instName : Name to give instruction(s) Value *BuilderImpl::CreateDotProduct(Value *const vector1, Value *const vector2, const Twine &instName) { + if (vector1->getType()->getScalarType()->isBFloatTy()) { + assert(getPipelineState()->getTargetInfo().getGfxIpVersion().major >= 11); + // amdgcn_fdot2_bf16_bf16 will be used. + const auto fp16RoundMode = + getPipelineState()->getShaderModes()->getCommonShaderMode(m_shaderStage.value()).fp16RoundMode; + const auto vectorTy = dyn_cast(vector1->getType()); + if (vectorTy && (fp16RoundMode == FpRoundMode::DontCare || fp16RoundMode == FpRoundMode::Even)) { + int compCount = vectorTy->getNumElements(); + Value *result = nullptr; + + if (compCount % 2 == 0) { + // If all products are of the form +x * -0.0, then the result should be -0.0. This requires a -0.0 + // initial value. + // + // However, we prefer +0.0 as initial value when signed zeros are disabled because it can be encoded as an + // inline constant. + result = ConstantFP::get(getBFloatTy(), getFastMathFlags().noSignedZeros() ? +0.0 : -0.0); + } else { + // If the component count is odd, prefer feeding the last product (odd one out) as initial value. 
+ Value *lhs = CreateExtractElement(vector1, compCount - 1); + Value *rhs = CreateExtractElement(vector2, compCount - 1); + result = CreateFMul(lhs, rhs); + } + + for (int i = 0; i + 1 < compCount; i += 2) { + Value *lhs = CreateShuffleVector(vector1, {i, i + 1}); + Value *rhs = CreateShuffleVector(vector2, {i, i + 1}); + result = CreateIntrinsic(getBFloatTy(), Intrinsic::amdgcn_fdot2_bf16_bf16, {lhs, rhs, result}); + } + return result; + } + } Value *product = CreateFMul(vector1, vector2); if (!isa(product->getType())) diff --git a/lgc/builder/BuilderRecorder.cpp b/lgc/builder/BuilderRecorder.cpp index d143ae8373..280838a6f3 100644 --- a/lgc/builder/BuilderRecorder.cpp +++ b/lgc/builder/BuilderRecorder.cpp @@ -2046,11 +2046,16 @@ Instruction *Builder::record(BuilderOpcode opcode, Type *resultTy, ArrayRefsetDoesNotAccessMemory(); break; + case BuilderOpcode::ImageSample: + case BuilderOpcode::ImageSampleConvert: + // Function read and write memory if return is void. + if (!resultTy || resultTy->isVoidTy()) + break; + // Otherwise treat as read only; intentional fall-through. + LLVM_FALLTHROUGH; case BuilderOpcode::ImageGather: case BuilderOpcode::ImageLoad: case BuilderOpcode::ImageLoadWithFmask: - case BuilderOpcode::ImageSample: - case BuilderOpcode::ImageSampleConvert: case BuilderOpcode::LoadPushConstantsPtr: case BuilderOpcode::ReadBaryCoord: case BuilderOpcode::ReadBuiltInInput: diff --git a/lgc/builder/ImageBuilder.cpp b/lgc/builder/ImageBuilder.cpp index edead69449..e24421b9f0 100644 --- a/lgc/builder/ImageBuilder.cpp +++ b/lgc/builder/ImageBuilder.cpp @@ -324,6 +324,155 @@ static const IntrinsicTableEntry ImageSampleIntrinsicTable[] = { Intrinsic::amdgcn_image_sample_o_2darray}}, {0}}; +// No return image sample intrinsics. +// Should mirror the table for normal image sample intrinsics. 
+static const IntrinsicTableEntry ImageSampleNoReturnIntrinsicTable[] = { + {(1U << Builder::ImageAddressIdxCoordinate), + { + Intrinsic::amdgcn_image_sample_1d_nortn, + Intrinsic::amdgcn_image_sample_2d_nortn, + Intrinsic::amdgcn_image_sample_3d_nortn, + Intrinsic::amdgcn_image_sample_cube_nortn, + Intrinsic::amdgcn_image_sample_1darray_nortn, + Intrinsic::amdgcn_image_sample_2darray_nortn, + }}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxLodBias), + {Intrinsic::amdgcn_image_sample_b_1d_nortn, Intrinsic::amdgcn_image_sample_b_2d_nortn, + Intrinsic::amdgcn_image_sample_b_3d_nortn, Intrinsic::amdgcn_image_sample_b_cube_nortn, + Intrinsic::amdgcn_image_sample_b_1darray_nortn, Intrinsic::amdgcn_image_sample_b_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxLodBias) | + (1U << Builder::ImageAddressIdxLodClamp), + {Intrinsic::amdgcn_image_sample_b_cl_1d_nortn, Intrinsic::amdgcn_image_sample_b_cl_2d_nortn, + Intrinsic::amdgcn_image_sample_b_cl_3d_nortn, Intrinsic::amdgcn_image_sample_b_cl_cube_nortn, + Intrinsic::amdgcn_image_sample_b_cl_1darray_nortn, Intrinsic::amdgcn_image_sample_b_cl_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxLodBias) | + (1U << Builder::ImageAddressIdxLodClamp), + {Intrinsic::amdgcn_image_sample_b_cl_o_1d_nortn, Intrinsic::amdgcn_image_sample_b_cl_o_2d_nortn, + Intrinsic::amdgcn_image_sample_b_cl_o_3d_nortn, Intrinsic::amdgcn_image_sample_b_cl_o_cube_nortn, + Intrinsic::amdgcn_image_sample_b_cl_o_1darray_nortn, Intrinsic::amdgcn_image_sample_b_cl_o_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxLodBias) | + (1U << Builder::ImageAddressIdxOffset), + {Intrinsic::amdgcn_image_sample_b_o_1d_nortn, Intrinsic::amdgcn_image_sample_b_o_2d_nortn, + Intrinsic::amdgcn_image_sample_b_o_3d_nortn, Intrinsic::amdgcn_image_sample_b_o_cube_nortn, + 
Intrinsic::amdgcn_image_sample_b_o_1darray_nortn, Intrinsic::amdgcn_image_sample_b_o_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxZCompare), + {Intrinsic::amdgcn_image_sample_c_1d_nortn, Intrinsic::amdgcn_image_sample_c_2d_nortn, + Intrinsic::amdgcn_image_sample_c_3d_nortn, Intrinsic::amdgcn_image_sample_c_cube_nortn, + Intrinsic::amdgcn_image_sample_c_1darray_nortn, Intrinsic::amdgcn_image_sample_c_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxZCompare) | + (1U << Builder::ImageAddressIdxLodBias), + {Intrinsic::amdgcn_image_sample_c_b_1d_nortn, Intrinsic::amdgcn_image_sample_c_b_2d_nortn, + Intrinsic::amdgcn_image_sample_c_b_3d_nortn, Intrinsic::amdgcn_image_sample_c_b_cube_nortn, + Intrinsic::amdgcn_image_sample_c_b_1darray_nortn, Intrinsic::amdgcn_image_sample_c_b_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxZCompare) | + (1U << Builder::ImageAddressIdxLodBias) | (1U << Builder::ImageAddressIdxLodClamp), + {Intrinsic::amdgcn_image_sample_c_b_cl_1d_nortn, Intrinsic::amdgcn_image_sample_c_b_cl_2d_nortn, + Intrinsic::amdgcn_image_sample_c_b_cl_3d_nortn, Intrinsic::amdgcn_image_sample_c_b_cl_cube_nortn, + Intrinsic::amdgcn_image_sample_c_b_cl_1darray_nortn, Intrinsic::amdgcn_image_sample_c_b_cl_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxZCompare) | + (1U << Builder::ImageAddressIdxLodBias) | (1U << Builder::ImageAddressIdxLodClamp) | + (1U << Builder::ImageAddressIdxOffset), + {Intrinsic::amdgcn_image_sample_c_b_cl_o_1d_nortn, Intrinsic::amdgcn_image_sample_c_b_cl_o_2d_nortn, + Intrinsic::amdgcn_image_sample_c_b_cl_o_3d_nortn, Intrinsic::amdgcn_image_sample_c_b_cl_o_cube_nortn, + Intrinsic::amdgcn_image_sample_c_b_cl_o_1darray_nortn, Intrinsic::amdgcn_image_sample_c_b_cl_o_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << 
Builder::ImageAddressIdxZCompare) | + (1U << Builder::ImageAddressIdxLodBias) | (1U << Builder::ImageAddressIdxOffset), + {Intrinsic::amdgcn_image_sample_c_b_o_1d_nortn, Intrinsic::amdgcn_image_sample_c_b_o_2d_nortn, + Intrinsic::amdgcn_image_sample_c_b_o_3d_nortn, Intrinsic::amdgcn_image_sample_c_b_o_cube_nortn, + Intrinsic::amdgcn_image_sample_c_b_o_1darray_nortn, Intrinsic::amdgcn_image_sample_c_b_o_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxZCompare) | + (1U << Builder::ImageAddressIdxLodClamp), + {Intrinsic::amdgcn_image_sample_c_cl_1d_nortn, Intrinsic::amdgcn_image_sample_c_cl_2d_nortn, + Intrinsic::amdgcn_image_sample_c_cl_3d_nortn, Intrinsic::amdgcn_image_sample_c_cl_cube_nortn, + Intrinsic::amdgcn_image_sample_c_cl_1darray_nortn, Intrinsic::amdgcn_image_sample_c_cl_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxZCompare) | + (1U << Builder::ImageAddressIdxLodClamp) | (1U << Builder::ImageAddressIdxOffset), + {Intrinsic::amdgcn_image_sample_c_cl_o_1d_nortn, Intrinsic::amdgcn_image_sample_c_cl_o_2d_nortn, + Intrinsic::amdgcn_image_sample_c_cl_o_3d_nortn, Intrinsic::amdgcn_image_sample_c_cl_o_cube_nortn, + Intrinsic::amdgcn_image_sample_c_cl_o_1darray_nortn, Intrinsic::amdgcn_image_sample_c_cl_o_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxZCompare) | + (1U << Builder::ImageAddressIdxDerivativeX) | (1U << Builder::ImageAddressIdxDerivativeY), + {Intrinsic::amdgcn_image_sample_c_d_1d_nortn, Intrinsic::amdgcn_image_sample_c_d_2d_nortn, + Intrinsic::amdgcn_image_sample_c_d_3d_nortn, Intrinsic::amdgcn_image_sample_c_d_cube_nortn, + Intrinsic::amdgcn_image_sample_c_d_1darray_nortn, Intrinsic::amdgcn_image_sample_c_d_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxZCompare) | + (1U << Builder::ImageAddressIdxDerivativeX) | (1U << Builder::ImageAddressIdxDerivativeY) | 
+ (1U << Builder::ImageAddressIdxLodClamp), + {Intrinsic::amdgcn_image_sample_c_d_cl_1d_nortn, Intrinsic::amdgcn_image_sample_c_d_cl_2d_nortn, + Intrinsic::amdgcn_image_sample_c_d_cl_3d_nortn, Intrinsic::amdgcn_image_sample_c_d_cl_cube_nortn, + Intrinsic::amdgcn_image_sample_c_d_cl_1darray_nortn, Intrinsic::amdgcn_image_sample_c_d_cl_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxZCompare) | + (1U << Builder::ImageAddressIdxDerivativeX) | (1U << Builder::ImageAddressIdxDerivativeY) | + (1U << Builder::ImageAddressIdxLodClamp) | (1U << Builder::ImageAddressIdxOffset), + {Intrinsic::amdgcn_image_sample_c_d_cl_o_1d_nortn, Intrinsic::amdgcn_image_sample_c_d_cl_o_2d_nortn, + Intrinsic::amdgcn_image_sample_c_d_cl_o_3d_nortn, Intrinsic::amdgcn_image_sample_c_d_cl_o_cube_nortn, + Intrinsic::amdgcn_image_sample_c_d_cl_o_1darray_nortn, Intrinsic::amdgcn_image_sample_c_d_cl_o_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxDerivativeX) | + (1U << Builder::ImageAddressIdxDerivativeY) | (1U << Builder::ImageAddressIdxZCompare) | + (1U << Builder::ImageAddressIdxOffset), + {Intrinsic::amdgcn_image_sample_c_d_o_1d_nortn, Intrinsic::amdgcn_image_sample_c_d_o_2d_nortn, + Intrinsic::amdgcn_image_sample_c_d_o_3d_nortn, Intrinsic::amdgcn_image_sample_c_d_o_cube_nortn, + Intrinsic::amdgcn_image_sample_c_d_o_1darray_nortn, Intrinsic::amdgcn_image_sample_c_d_o_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxLod) | + (1U << Builder::ImageAddressIdxZCompare), + {Intrinsic::amdgcn_image_sample_c_l_1d_nortn, Intrinsic::amdgcn_image_sample_c_l_2d_nortn, + Intrinsic::amdgcn_image_sample_c_l_3d_nortn, Intrinsic::amdgcn_image_sample_c_l_cube_nortn, + Intrinsic::amdgcn_image_sample_c_l_1darray_nortn, Intrinsic::amdgcn_image_sample_c_l_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxOffset) | + (1U << 
Builder::ImageAddressIdxLod) | (1U << Builder::ImageAddressIdxZCompare), + {Intrinsic::amdgcn_image_sample_c_l_o_1d_nortn, Intrinsic::amdgcn_image_sample_c_l_o_2d_nortn, + Intrinsic::amdgcn_image_sample_c_l_o_3d_nortn, Intrinsic::amdgcn_image_sample_c_l_o_cube_nortn, + Intrinsic::amdgcn_image_sample_c_l_o_1darray_nortn, Intrinsic::amdgcn_image_sample_c_l_o_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxOffset) | + (1U << Builder::ImageAddressIdxZCompare), + {Intrinsic::amdgcn_image_sample_c_o_1d_nortn, Intrinsic::amdgcn_image_sample_c_o_2d_nortn, + Intrinsic::amdgcn_image_sample_c_o_3d_nortn, Intrinsic::amdgcn_image_sample_c_o_cube_nortn, + Intrinsic::amdgcn_image_sample_c_o_1darray_nortn, Intrinsic::amdgcn_image_sample_c_o_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxLodClamp), + {Intrinsic::amdgcn_image_sample_cl_1d_nortn, Intrinsic::amdgcn_image_sample_cl_2d_nortn, + Intrinsic::amdgcn_image_sample_cl_3d_nortn, Intrinsic::amdgcn_image_sample_cl_cube_nortn, + Intrinsic::amdgcn_image_sample_cl_1darray_nortn, Intrinsic::amdgcn_image_sample_cl_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxLodClamp) | + (1U << Builder::ImageAddressIdxOffset), + {Intrinsic::amdgcn_image_sample_cl_o_1d_nortn, Intrinsic::amdgcn_image_sample_cl_o_2d_nortn, + Intrinsic::amdgcn_image_sample_cl_o_3d_nortn, Intrinsic::amdgcn_image_sample_cl_o_cube_nortn, + Intrinsic::amdgcn_image_sample_cl_o_1darray_nortn, Intrinsic::amdgcn_image_sample_cl_o_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxDerivativeX) | + (1U << Builder::ImageAddressIdxDerivativeY), + {Intrinsic::amdgcn_image_sample_d_1d_nortn, Intrinsic::amdgcn_image_sample_d_2d_nortn, + Intrinsic::amdgcn_image_sample_d_3d_nortn, Intrinsic::amdgcn_image_sample_d_cube_nortn, + Intrinsic::amdgcn_image_sample_d_1darray_nortn, 
Intrinsic::amdgcn_image_sample_d_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxDerivativeX) | + (1U << Builder::ImageAddressIdxDerivativeY) | (1U << Builder::ImageAddressIdxLodClamp), + {Intrinsic::amdgcn_image_sample_d_cl_1d_nortn, Intrinsic::amdgcn_image_sample_d_cl_2d_nortn, + Intrinsic::amdgcn_image_sample_d_cl_3d_nortn, Intrinsic::amdgcn_image_sample_d_cl_cube_nortn, + Intrinsic::amdgcn_image_sample_d_cl_1darray_nortn, Intrinsic::amdgcn_image_sample_d_cl_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxDerivativeX) | + (1U << Builder::ImageAddressIdxDerivativeY) | (1U << Builder::ImageAddressIdxLodClamp) | + (1U << Builder::ImageAddressIdxOffset), + {Intrinsic::amdgcn_image_sample_d_cl_o_1d_nortn, Intrinsic::amdgcn_image_sample_d_cl_o_2d_nortn, + Intrinsic::amdgcn_image_sample_d_cl_o_3d_nortn, Intrinsic::amdgcn_image_sample_d_cl_o_cube_nortn, + Intrinsic::amdgcn_image_sample_d_cl_o_1darray_nortn, Intrinsic::amdgcn_image_sample_d_cl_o_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxDerivativeX) | + (1U << Builder::ImageAddressIdxDerivativeY) | (1U << Builder::ImageAddressIdxOffset), + {Intrinsic::amdgcn_image_sample_d_o_1d_nortn, Intrinsic::amdgcn_image_sample_d_o_2d_nortn, + Intrinsic::amdgcn_image_sample_d_o_3d_nortn, Intrinsic::amdgcn_image_sample_d_o_cube_nortn, + Intrinsic::amdgcn_image_sample_d_o_1darray_nortn, Intrinsic::amdgcn_image_sample_d_o_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxLod), + {Intrinsic::amdgcn_image_sample_l_1d_nortn, Intrinsic::amdgcn_image_sample_l_2d_nortn, + Intrinsic::amdgcn_image_sample_l_3d_nortn, Intrinsic::amdgcn_image_sample_l_cube_nortn, + Intrinsic::amdgcn_image_sample_l_1darray_nortn, Intrinsic::amdgcn_image_sample_l_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxLod) | + (1U << 
Builder::ImageAddressIdxOffset), + {Intrinsic::amdgcn_image_sample_l_o_1d_nortn, Intrinsic::amdgcn_image_sample_l_o_2d_nortn, + Intrinsic::amdgcn_image_sample_l_o_3d_nortn, Intrinsic::amdgcn_image_sample_l_o_cube_nortn, + Intrinsic::amdgcn_image_sample_l_o_1darray_nortn, Intrinsic::amdgcn_image_sample_l_o_2darray_nortn}}, + {(1U << Builder::ImageAddressIdxCoordinate) | (1U << Builder::ImageAddressIdxOffset), + {Intrinsic::amdgcn_image_sample_o_1d_nortn, Intrinsic::amdgcn_image_sample_o_2d_nortn, + Intrinsic::amdgcn_image_sample_o_3d_nortn, Intrinsic::amdgcn_image_sample_o_cube_nortn, + Intrinsic::amdgcn_image_sample_o_1darray_nortn, Intrinsic::amdgcn_image_sample_o_2darray_nortn}}, + {0}}; + // Intrinsic ID table for struct buffer atomic static const Intrinsic::ID StructBufferAtomicIntrinsicTable[] = { Intrinsic::amdgcn_struct_buffer_atomic_swap, Intrinsic::amdgcn_struct_buffer_atomic_cmpswap, @@ -1000,7 +1149,8 @@ Value *BuilderImpl::CreateImageSampleGather(Type *resultTy, unsigned dim, unsign // Build the intrinsic arguments and overloaded types. SmallVector args; SmallVector overloadTys; - overloadTys.push_back(resultTy); + if (resultTy && !resultTy->isVoidTy()) + overloadTys.push_back(resultTy); // Dmask. unsigned dmask = 15; @@ -1089,7 +1239,9 @@ Value *BuilderImpl::CreateImageSampleGather(Type *resultTy, unsigned dim, unsign args.push_back(getInt32(coherent.u32All)); // Search the intrinsic ID table. - auto table = isSample ? &ImageSampleIntrinsicTable[0] : &ImageGather4IntrinsicTable[0]; + auto table = isSample ? ((!resultTy || resultTy->isVoidTy()) ? 
&ImageSampleNoReturnIntrinsicTable[0] + : &ImageSampleIntrinsicTable[0]) + : &ImageGather4IntrinsicTable[0]; for (;; ++table) { assert(table->matchMask != 0 && "Image sample/gather intrinsic ID not found"); if (table->matchMask == addressMask) diff --git a/lgc/builder/InOutBuilder.cpp b/lgc/builder/InOutBuilder.cpp index 0d1101b659..9502c6af2e 100644 --- a/lgc/builder/InOutBuilder.cpp +++ b/lgc/builder/InOutBuilder.cpp @@ -424,36 +424,7 @@ void BuilderImpl::markGenericInputOutputUsage(bool isOutput, unsigned location, } if (!(m_shaderStage == ShaderStage::Geometry && isOutput)) { - // Not GS output - bool keepAllLocations = false; - if (getPipelineState()->isUnlinked()) { - if (isOutput) { - // Keep all locations if the next stage of the output is fragment shader or is unspecified - if (m_shaderStage != ShaderStage::Fragment) { - auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage.value()); - keepAllLocations = nextStage == ShaderStage::Fragment || !nextStage; - } - } else { - // Keep all locations if it is the input of fragment shader - keepAllLocations = m_shaderStage == ShaderStage::Fragment; - } - } - if (inOutLocInfoMap) { - // Handle per-vertex input/output - if (keepAllLocations) { - // If keeping all locations, add location map entries whose locations are before this input/output - for (unsigned i = 0; i < location; ++i) { - InOutLocationInfo origLocationInfo; - origLocationInfo.setLocation(i); - if (inOutLocInfoMap->count(origLocationInfo) == 0) { - // Add this location map entry only if it doesn't exist - auto &newLocationInfo = (*inOutLocInfoMap)[origLocationInfo]; - newLocationInfo.setData(InvalidValue); - } - } - } - // Add location map entries for this input/output // NOTE: For TCS input/output, TES input, and mesh shader output, their components could be separately indexed. 
@@ -507,16 +478,6 @@ void BuilderImpl::markGenericInputOutputUsage(bool isOutput, unsigned location, } if (perPatchInOutLocMap) { - // Handle per-patch input/output - if (keepAllLocations) { - // If keeping all locations, add location map entries whose locations are before this input/output - for (unsigned i = 0; i < location; ++i) { - // Add this location map entry only if it doesn't exist - if (perPatchInOutLocMap->count(i) == 0) - (*perPatchInOutLocMap)[i] = InvalidValue; - } - } - // Add location map entries for this input/output for (unsigned i = 0; i < locationCount; ++i) (*perPatchInOutLocMap)[location + i] = @@ -525,16 +486,6 @@ void BuilderImpl::markGenericInputOutputUsage(bool isOutput, unsigned location, } if (perPrimitiveInOutLocMap) { - // Handle per-primitive input/output - if (keepAllLocations) { - // If keeping all locations, add location map entries whose locations are before this input/output - for (unsigned i = 0; i < location; ++i) { - // Add this location map entry only if it doesn't exist - if (perPrimitiveInOutLocMap->count(i) == 0) - (*perPrimitiveInOutLocMap)[i] = InvalidValue; - } - } - // Add location map entries for this input/output for (unsigned i = 0; i < locationCount; ++i) (*perPrimitiveInOutLocMap)[location + i] = @@ -1453,7 +1404,7 @@ Value *BuilderImpl::readCsBuiltIn(BuiltInKind builtIn, const Twine &instName) { localInvocationId = CreateInsertElement(localInvocationId, getInt32(0), 2); } - if (m_shaderStage == ShaderStage::Compute) { + if (m_shaderStage == ShaderStage::Compute || m_shaderStage == ShaderStage::Task) { // Reconfigure the workgroup layout later if it's necessary. if (!getPipelineState()->isComputeLibrary()) { // Insert a call that later on might get lowered to code to reconfigure the workgroup. 
diff --git a/lgc/builder/MatrixBuilder.cpp b/lgc/builder/MatrixBuilder.cpp index a21161c04b..e3ed620d3e 100644 --- a/lgc/builder/MatrixBuilder.cpp +++ b/lgc/builder/MatrixBuilder.cpp @@ -358,6 +358,7 @@ Type *BuilderCommon::transCooperativeMatrixElementType(CooperativeMatrixElementT case CooperativeMatrixElementType::Float32: return getFloatTy(); case CooperativeMatrixElementType::Int16: + case CooperativeMatrixElementType::BFloat16: return getInt16Ty(); case CooperativeMatrixElementType::Int32: return getInt32Ty(); @@ -391,3 +392,29 @@ Type *BuilderCommon::getCooperativeMatrixTy(CooperativeMatrixElementType elemTyp llvm_unreachable("Type is not supported!"); } } + +// ===================================================================================================================== +// Check whether the element type of a cooperative matrix has the specified bit width; returns true if it does. +// +// @param elemType : the matrix element type (element types not listed in the switch are treated as width 0) +// @param bitWidth : the bit width to compare the element width against +bool BuilderCommon::isTypeNCooperativeMatrix(CooperativeMatrixElementType elemType, unsigned bitWidth) { + unsigned width = 0; + switch (elemType) { + case lgc::CooperativeMatrixElementType::Float16: + case lgc::CooperativeMatrixElementType::BFloat16: + case lgc::CooperativeMatrixElementType::Int16: + width = 16; + break; + case lgc::CooperativeMatrixElementType::Float32: + case lgc::CooperativeMatrixElementType::Int32: + width = 32; + break; + case lgc::CooperativeMatrixElementType::Int8: + width = 8; + break; + default: + break; + } + return width == bitWidth; +} diff --git a/lgc/builder/SubgroupBuilder.cpp b/lgc/builder/SubgroupBuilder.cpp index c62acd661c..65490d9861 100644 --- a/lgc/builder/SubgroupBuilder.cpp +++ b/lgc/builder/SubgroupBuilder.cpp @@ -707,21 +707,17 @@ Value *BuilderImpl::CreateSubgroupClusteredInclusive(GroupArithOp groupArithOp, createDppUpdate(identity, result, DppCtrl::DppRowSr8, 0xF, 0xC, 0)); } - Value *const threadMask = createThreadMask(); - if (clusterSize >= 32) { - Value *const
maskedPermLane = - createThreadMaskedSelect(threadMask, 0xFFFF0000FFFF0000, - createPermLaneX16(result, result, UINT32_MAX, UINT32_MAX, true, false), identity); + Value *permLaneX = createPermLaneX16(result, result, UINT32_MAX, UINT32_MAX, true, false); + Value *const maskedPermLane = createInverseBallotSelect(0xFFFF0000FFFF0000, permLaneX, identity); + // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2). result = createGroupArithmeticOperation(groupArithOp, result, maskedPermLane); } if (clusterSize == 64) { - Value *const broadcast31 = CreateSubgroupBroadcast(result, getInt32(31), instName); - - Value *const maskedBroadcast = createThreadMaskedSelect(threadMask, 0xFFFFFFFF00000000, broadcast31, identity); + Value *const maskedBroadcast = createInverseBallotSelect(0xFFFFFFFF00000000, broadcast31, identity); // Combine broadcast of 31 with the top two rows only. result = createGroupArithmeticOperation(groupArithOp, result, maskedBroadcast); @@ -767,8 +763,6 @@ Value *BuilderImpl::CreateSubgroupClusteredExclusive(GroupArithOp groupArithOp, Value *shiftRight = nullptr; - Value *const threadMask = createThreadMask(); - // Shift right within each row: // 0b0110,0101,0100,0011,0010,0001,0000,1111 = 0x6543210F // 0b1110,1101,1100,1011,1010,1001,1000,0111 = 0xEDCBA987 @@ -785,9 +779,8 @@ Value *BuilderImpl::CreateSubgroupClusteredExclusive(GroupArithOp groupArithOp, // Exchange first column value cross rows(row 1<--> row 0, row 3<-->row2) // Only first column value from each row join permlanex - shiftRight = - createThreadMaskedSelect(threadMask, 0x0001000100010001, - createPermLaneX16(shiftRight, shiftRight, 0, UINT32_MAX, true, false), shiftRight); + Value *permLaneX = createPermLaneX16(shiftRight, shiftRight, 0, UINT32_MAX, true, false); + shiftRight = createInverseBallotSelect(0x0001000100010001, permLaneX, shiftRight); if (clusterSize >= 2) { // The DPP operation has all rows active and all banks in the rows active (0xF). 
@@ -820,9 +813,8 @@ Value *BuilderImpl::CreateSubgroupClusteredExclusive(GroupArithOp groupArithOp, } if (clusterSize >= 32) { - Value *const maskedPermLane = - createThreadMaskedSelect(threadMask, 0xFFFF0000FFFF0000, - createPermLaneX16(result, result, UINT32_MAX, UINT32_MAX, true, false), identity); + Value *permLaneX = createPermLaneX16(result, result, UINT32_MAX, UINT32_MAX, true, false); + Value *const maskedPermLane = createInverseBallotSelect(0xFFFF0000FFFF0000, permLaneX, identity); // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2). result = createGroupArithmeticOperation(groupArithOp, result, maskedPermLane); @@ -830,8 +822,7 @@ Value *BuilderImpl::CreateSubgroupClusteredExclusive(GroupArithOp groupArithOp, if (clusterSize >= 64) { Value *const broadcast31 = CreateSubgroupBroadcast(result, getInt32(31), instName); - - Value *const maskedBroadcast = createThreadMaskedSelect(threadMask, 0xFFFFFFFF00000000, broadcast31, identity); + Value *const maskedBroadcast = createInverseBallotSelect(0xFFFFFFFF00000000, broadcast31, identity); // Combine broadcast of 31 with the top two rows only. result = createGroupArithmeticOperation(groupArithOp, result, maskedBroadcast); @@ -901,9 +892,6 @@ Value *BuilderImpl::CreateSubgroupClusteredMultiExclusive(GroupArithOp groupArit { previousLaneValue = CreateSubgroupShuffle(result, previousLaneIndex, instName); } // Don't accumulate if there is no valid lane found in previous cluster or current lane is no need for accumulate. -#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 479645 - Value *isAccumulateLane = CreateICmpNE(CreateAnd(laneIndex, getInt32(clusterSize)), getInt32(0)); -#else // TODO: Check amdgcn_inverse_ballot version. 
const unsigned long long halfMasks[] = { 0x5555555555555555ull, 0x3333333333333333ull, 0x0f0f0f0f0f0f0f0full, @@ -912,7 +900,6 @@ Value *BuilderImpl::CreateSubgroupClusteredMultiExclusive(GroupArithOp groupArit Value *isAccumulateLane = CreateIntrinsic(getInt1Ty(), Intrinsic::amdgcn_inverse_ballot, ConstantInt::get(clusterMask->getType(), ~halfMasks[log2ClusterSize])); -#endif previousLaneValue = CreateSelect(CreateAnd(isAccumulateLane, isPreviousLaneValid), previousLaneValue, identity); result = createGroupArithmeticOperation(groupArithOp, result, previousLaneValue); @@ -1414,6 +1401,23 @@ Value *BuilderImpl::createThreadMaskedSelect(Value *const threadMask, uint64_t a return CreateSelect(CreateICmpNE(CreateAnd(threadMask, andMaskVal), zero), value1, value2); } +// ===================================================================================================================== +// Create a masked select using inverse ballot: the first value is selected where the inverse ballot of the select +// mask is true, and the second value is selected otherwise. +// +// @param selectMask : The lane select mask; only its low 32 bits are used when the shader wave size is not 64. +// @param value1 : The value selected where the inverse ballot is true. +// @param value2 : The value selected where the inverse ballot is false. +Value *BuilderImpl::createInverseBallotSelect(uint64_t selectMask, Value *const value1, Value *const value2) { + CallInst *inverseBallot = + (getShaderWaveSize() == 64) + ? CreateIntrinsic(Intrinsic::amdgcn_inverse_ballot, getInt64Ty(), getInt64(selectMask)) + : CreateIntrinsic(Intrinsic::amdgcn_inverse_ballot, getInt32Ty(), getInt32(selectMask & 0xffffffff)); + // Mark the call "convergent" so that IR optimizations do not merge this intrinsic away. + inverseBallot->addFnAttr(Attribute::Convergent); + return CreateSelect(inverseBallot, value1, value2); +} + // ===================================================================================================================== // Do group ballot, turning a per-lane boolean value (in a VGPR) into a subgroup-wide shared SGPR.
// diff --git a/lgc/include/lgc/builder/BuilderImpl.h b/lgc/include/lgc/builder/BuilderImpl.h index 09553be177..b1fd73c612 100644 --- a/lgc/include/lgc/builder/BuilderImpl.h +++ b/lgc/include/lgc/builder/BuilderImpl.h @@ -786,6 +786,7 @@ class BuilderImpl : public BuilderDefs { llvm::Value *createThreadMask(); llvm::Value *createThreadMaskedSelect(llvm::Value *const threadMask, uint64_t andMask, llvm::Value *const value1, llvm::Value *const value2); + llvm::Value *createInverseBallotSelect(uint64_t selectMask, llvm::Value *const value1, llvm::Value *const value2); uint16_t getDsSwizzleBitMode(uint8_t xorMask, uint8_t orMask, uint8_t andMask); uint16_t getDsSwizzleQuadMode(uint8_t lane0, uint8_t lane1, uint8_t lane2, uint8_t lane3); diff --git a/lgc/include/lgc/patch/LowerDebugPrintf.h b/lgc/include/lgc/patch/LowerDebugPrintf.h index 9428eac9a3..2f198ec90f 100644 --- a/lgc/include/lgc/patch/LowerDebugPrintf.h +++ b/lgc/include/lgc/patch/LowerDebugPrintf.h @@ -41,6 +41,7 @@ namespace lgc { class DebugPrintfOp; +struct ResourceNode; // ===================================================================================================================== // Pass to lower debug.printf calls @@ -63,6 +64,7 @@ class LowerDebugPrintf : public llvm::PassInfoMixin { llvm::SmallVector m_toErase; llvm::Value *m_debugPrintfBuffer = nullptr; PipelineState *m_pipelineState = nullptr; + const ResourceNode *m_topNode = nullptr; }; } // namespace lgc diff --git a/lgc/include/lgc/patch/LowerGpuRt.h b/lgc/include/lgc/patch/LowerGpuRt.h index de44bec437..5876d01cd7 100644 --- a/lgc/include/lgc/patch/LowerGpuRt.h +++ b/lgc/include/lgc/patch/LowerGpuRt.h @@ -61,6 +61,7 @@ class GpurtContinuationStackIsGlobalOp; class GpurtWaveScanOp; class GpurtGetKnownSetRayFlagsOp; class GpurtGetKnownUnsetRayFlagsOp; +class GpurtInitStaticIdOp; class LowerGpuRt : public llvm::PassInfoMixin { public: @@ -91,11 +92,13 @@ class LowerGpuRt : public llvm::PassInfoMixin { void 
visitWaveScanOp(lgc::GpurtWaveScanOp &inst); void visitGetKnownSetRayFlagsOp(lgc::GpurtGetKnownSetRayFlagsOp &inst); void visitGetKnownUnsetRayFlagsOp(lgc::GpurtGetKnownUnsetRayFlagsOp &inst); + void visitInitStaticId(lgc::GpurtInitStaticIdOp &inst); llvm::Value *m_stack = nullptr; // Stack array to hold stack value llvm::Type *m_stackTy = nullptr; // Stack type PipelineState *m_pipelineState = nullptr; // Pipeline state llvm::SmallVector m_callsToLower; // Call instruction to lower llvm::SmallSet m_funcsToLower; // Functions to lower Builder *m_builder = nullptr; + unsigned m_rayStaticId = 0; }; } // namespace lgc diff --git a/lgc/include/lgc/patch/PatchBufferOp.h b/lgc/include/lgc/patch/PatchBufferOp.h index 332d5bdd5e..a1feacdd79 100644 --- a/lgc/include/lgc/patch/PatchBufferOp.h +++ b/lgc/include/lgc/patch/PatchBufferOp.h @@ -137,7 +137,8 @@ class BufferOpLowering { llvm::Instruction *makeLoop(llvm::Value *const loopStart, llvm::Value *const loopEnd, llvm::Value *const loopStride, llvm::Instruction *const insertPos); llvm::Value *createGlobalPointerAccess(llvm::Value *const bufferDesc, llvm::Value *const offset, - llvm::Type *const type, llvm::Instruction &inst, + llvm::Value *const strideIndex, llvm::Type *const type, + llvm::Instruction &inst, const llvm::function_ref callback); llvm::Value *createCompactDesc(llvm::Value *const buffAddress, llvm::Value *const stride); llvm::Value *createLoadDesc(llvm::Value *buffAddress, bool forceRawView, bool isCompact); diff --git a/lgc/include/lgc/patch/PatchEntryPointMutate.h b/lgc/include/lgc/patch/PatchEntryPointMutate.h index 70b83baa65..e56338b3ef 100644 --- a/lgc/include/lgc/patch/PatchEntryPointMutate.h +++ b/lgc/include/lgc/patch/PatchEntryPointMutate.h @@ -172,10 +172,6 @@ class PatchEntryPointMutate : public Patch, public llvm::PassInfoMixin m_importedOutputCalls; // The output import calls std::vector m_inputCalls; // The input import calls std::vector m_outputCalls; // The output export calls + llvm::DenseSet 
m_outputCallLocations; // The output export calls' location ResourceUsage *m_resUsage; // Pointer to shader resource usage std::unique_ptr diff --git a/lgc/include/lgc/patch/WorkaroundDsSubdwordWrite.h b/lgc/include/lgc/patch/WorkaroundDsSubdwordWrite.h new file mode 100644 index 0000000000..7f75ecb6a5 --- /dev/null +++ b/lgc/include/lgc/patch/WorkaroundDsSubdwordWrite.h @@ -0,0 +1,59 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file WorkaroundDsSubdwordWrite.h + * @brief LLPC header file: contains declaration of class lgc::WorkaroundDsSubdwordWrite. + *********************************************************************************************************************** + */ +#pragma once + +#include "lgc/util/Internal.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" + +namespace lgc { + +// ===================================================================================================================== +// Represents the pass applying a sub-dword DS store workaround: +// +// - There is a bug (planned to be fixed) on gfx1150 with sub-dword writes +// to LDS. All sub-dword DS write ops are broken in the scenario when more +// than 1 thread of a wave32 has the same dword address, but different sub-dword +// address. Work around the issue by placing a waterfall loop +// around the ds_write, ensuring that the address written to is the same in +// all lanes. 
+// +// +class WorkaroundDsSubdwordWrite final : public llvm::PassInfoMixin<WorkaroundDsSubdwordWrite> { +public: + llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); + + static llvm::StringRef name() { return "Patch LLVM for Temporary GFX1150 DS Sub-dword write bug"; } +}; + +} // namespace lgc diff --git a/lgc/include/lgc/state/AbiMetadata.h b/lgc/include/lgc/state/AbiMetadata.h index 017b3a7dc5..00a8142eba 100644 --- a/lgc/include/lgc/state/AbiMetadata.h +++ b/lgc/include/lgc/state/AbiMetadata.h @@ -993,7 +993,7 @@ typedef enum SWIZZLE_MODE_ENUM { SW_VAR_R__GFX10CORE = 0x0000000f, SW_VAR_S_X__GFX10CORE = 0x0000001d, SW_VAR_D_X__GFX10CORE = 0x0000001e, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 || CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 || CHIP_HDR_PHOENIX1 || CHIP_HDR_STRIX1 SW_256KB_Z__GFX11 = 0x0000000c, SW_256KB_S__GFX11 = 0x0000000d, SW_256KB_D__GFX11 = 0x0000000e, diff --git a/lgc/include/lgc/state/PipelineState.h b/lgc/include/lgc/state/PipelineState.h index 018be0f05b..0515968f17 100644 --- a/lgc/include/lgc/state/PipelineState.h +++ b/lgc/include/lgc/state/PipelineState.h @@ -36,6 +36,7 @@ #include "lgc/state/ResourceUsage.h" #include "lgc/state/ShaderModes.h" #include "lgc/state/ShaderStage.h" +#include "lgc/util/BuilderBase.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/PassManager.h" @@ -103,7 +104,6 @@ struct NggControl { // Represents transform feedback state metadata struct XfbStateMetadata { bool enableXfb; // Whether transform feedback is active - bool enablePrimStats; // Whether to count generated primitives std::array xfbStrides; // The strides of each XFB buffer std::array streamXfbBuffers; // The stream-out XFB buffers bit mask per stream std::array streamActive; // Flag indicating which vertex stream is active @@ -413,8 +413,7 @@ class PipelineState final : public Pipeline { bool enableXfb() const { return
m_xfbStateMetadata.enableXfb; } // Check if we need count primitives if XFB is disabled - // NOTE: The old interface m_xfbStateMetadata.enablePrimStats will be removed later - bool enablePrimStats() const { return m_options.enablePrimGeneratedQuery || m_xfbStateMetadata.enablePrimStats; } + bool enablePrimStats() const { return m_options.enablePrimGeneratedQuery; } // Get transform feedback strides const std::array &getXfbBufferStrides() const { @@ -459,6 +458,7 @@ class PipelineState final : public Pipeline { // Get spill_threshold for a specific shader stage unsigned getSpillThreshold(ShaderStageEnum shaderStage) { return m_shaderSpillThreshold[shaderStage]; } + // ----------------------------------------------------------------------------------------------------------------- // Utility method templates to read and write IR metadata, used by PipelineState and ShaderModes diff --git a/lgc/include/lgc/state/ResourceUsage.h b/lgc/include/lgc/state/ResourceUsage.h index b4316ff5cc..5f617e8a99 100644 --- a/lgc/include/lgc/state/ResourceUsage.h +++ b/lgc/include/lgc/state/ResourceUsage.h @@ -176,6 +176,8 @@ struct ResourceUsage { struct { // Statement unsigned meshLinearDispatch : 1; // Mesh linear dispatch from task shader when group count Y/Z are both ones + // Workgroup layout + unsigned foldWorkgroupXY : 1; // The layout of the workgroup } task; // Vertex shader @@ -281,6 +283,7 @@ struct ResourceUsage { unsigned localInvocationIndex : 1; // Whether gl_LocalInvocationIndex is used unsigned subgroupId : 1; // Whether gl_SubgroupID is used unsigned numSubgroups : 1; // Whether gl_NumSubgroups is used + unsigned foldWorkgroupXY : 1; // Whether the workgroup is folded // Output unsigned pointSize : 1; // Whether gl_PointSize is used unsigned position : 1; // Whether gl_Position is used @@ -446,16 +449,17 @@ struct ResourceUsage { std::unordered_map> genericOutByteSizes[MaxGsStreams]; struct { - unsigned esGsRingItemSize; // Size of each vertex written to the ES -> GS 
Ring, in dwords. - unsigned gsVsRingItemSize; // Size of each primitive written to the GS -> VS Ring, in dwords. - unsigned esVertsPerSubgroup; // Number of vertices ES exports. - unsigned gsPrimsPerSubgroup; // Number of prims GS exports. - unsigned esGsLdsSize; // ES -> GS ring LDS size (GS in) - unsigned gsOnChipLdsSize; // Total LDS size for GS on-chip mode. - unsigned inputVertices; // Number of GS input vertices - unsigned primAmpFactor; // GS primitive amplification factor - bool enableMaxVertOut; // Whether to allow each GS instance to emit maximum vertices (NGG) - unsigned rayQueryLdsStackSize; // Ray query LDS stack size + unsigned esGsRingItemSize; // Size of each vertex written to the ES -> GS Ring, in dwords. + unsigned gsVsRingItemSize; // Size of each primitive written to the GS -> VS Ring, in dwords. + unsigned gsVsVertexItemSize[MaxGsStreams]; // Size of vertex item in the GS -> VS Ring, in dwords. + unsigned esVertsPerSubgroup; // Number of vertices ES exports. + unsigned gsPrimsPerSubgroup; // Number of prims GS exports. + unsigned esGsLdsSize; // ES -> GS ring LDS size (GS in) + unsigned gsOnChipLdsSize; // Total LDS size for GS on-chip mode. + unsigned inputVertices; // Number of GS input vertices + unsigned primAmpFactor; // GS primitive amplification factor + bool enableMaxVertOut; // Whether to allow each GS instance to emit maximum vertices (NGG) + unsigned rayQueryLdsStackSize; // Ray query LDS stack size } calcFactor = {}; unsigned outLocCount[MaxGsStreams] = {}; diff --git a/lgc/include/lgc/util/WorkgroupLayout.h b/lgc/include/lgc/util/WorkgroupLayout.h new file mode 100644 index 0000000000..f07c81e38b --- /dev/null +++ b/lgc/include/lgc/util/WorkgroupLayout.h @@ -0,0 +1,48 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file WorkgroupLayout.h + * @brief LLPC header file: Implementation of swizzle workgroup layout + *********************************************************************************************************************** + */ + +#pragma once + +#include "lgc/state/PipelineState.h" +#include "lgc/state/ResourceUsage.h" +#include "lgc/state/ShaderStage.h" +#include "lgc/util/BuilderBase.h" +#include "llvm/IR/IRBuilder.h" + +namespace lgc { + +SwizzleWorkgroupLayout calculateWorkgroupLayout(PipelineState *pipelineState, ShaderStageEnum shaderStage); + +llvm::Value *reconfigWorkgroupLayout(llvm::Value *localInvocationId, PipelineState *pipelineState, + ShaderStageEnum shaderStage, WorkgroupLayout macroLayout, + WorkgroupLayout microLayout, unsigned workgroupSizeX, unsigned workgroupSizeY, + unsigned workgroupSizeZ, bool isHwLocalInvocationId, BuilderBase &builder); +} // namespace lgc diff --git a/lgc/interface/lgc/BuilderCommon.h b/lgc/interface/lgc/BuilderCommon.h index 47d158b997..4efa6d7ebc 100644 --- a/lgc/interface/lgc/BuilderCommon.h +++ b/lgc/interface/lgc/BuilderCommon.h @@ -119,6 +119,9 @@ class BuilderCommon : public llvm_dialects::Builder { // Get the LGC type of a cooperative matrix with the given element type and layout. llvm::Type *getCooperativeMatrixTy(CooperativeMatrixElementType elemType, CooperativeMatrixLayout layout); + + // Whether the type of a cooperative matrix is specified bit width. 
+ static bool isTypeNCooperativeMatrix(CooperativeMatrixElementType elemType, unsigned bitWidth); }; } // namespace lgc diff --git a/lgc/interface/lgc/BuiltIns.h b/lgc/interface/lgc/BuiltIns.h index 4ef272e9a3..1b39b6ec88 100644 --- a/lgc/interface/lgc/BuiltIns.h +++ b/lgc/interface/lgc/BuiltIns.h @@ -33,11 +33,11 @@ namespace lgc { -// Max spirv builtIn value +// Max SPIR-V built-in value static constexpr unsigned BuiltInInternalBase = 0x10000000; -// Max builtIn value for PS semantic (unsigned 16-bit) -static constexpr unsigned MaxBuiltInSemantic = 0x0000000F; +// Mask to indicate a SPIR-V built-in used in PS semantic (unsigned 16-bit) +static constexpr unsigned BuiltInSemanticMask = 0x8000; // Define built-in kind enum. enum BuiltInKind : unsigned { diff --git a/lgc/interface/lgc/LgcDialect.h b/lgc/interface/lgc/LgcDialect.h index 9dc36f4115..34b09b3a5b 100644 --- a/lgc/interface/lgc/LgcDialect.h +++ b/lgc/interface/lgc/LgcDialect.h @@ -47,6 +47,7 @@ enum class CooperativeMatrixElementType : unsigned { Int16, // 16-bit integer Int32, // 32 bit integer Float16Packed, // packed 16-bit floating-point + BFloat16, // 16-bit brain floating-point }; // Layout is virtual concept, eg: 16bit and 32bit for matrixC will share the same layout initially. diff --git a/lgc/interface/lgc/Pipeline.h b/lgc/interface/lgc/Pipeline.h index fe382cf6f8..5f6e9ce437 100644 --- a/lgc/interface/lgc/Pipeline.h +++ b/lgc/interface/lgc/Pipeline.h @@ -100,6 +100,8 @@ enum class ThreadGroupSwizzleMode : unsigned { _4x4 = 1, // The tile size is 4x4 in x and y dimension. _8x8 = 2, // The tile size is 8x8 in x and y dimension. _16x16 = 3, // The tile size is 16x16 in x and y dimension. + _32x32 = 4, // The tile size is 32x32 in x and y dimension. + _64x64 = 5, // The tile size is 64x64 in x and y dimension. 
Count, }; @@ -127,7 +129,7 @@ static const char SampleShadingMetaName[] = "lgc.sample.shading"; // The front-end should zero-initialize a struct with "= {}" in case future changes add new fields. // Note: new fields must be added to the end of this structure to maintain test compatibility. union Options { - unsigned u32All[44]; + unsigned u32All[46]; struct { uint64_t hash[2]; // Pipeline hash to set in ELF PAL metadata unsigned includeDisassembly; // If set, the disassembly for all compiled shaders will be included @@ -197,7 +199,9 @@ union Options { unsigned reserved22; bool dynamicTopology; // Whether primitive topology is dynamic. bool reserved23; - bool forceUserDataSpill; // Whether to force all user data to be spilled (Currently only for RT). + bool forceUserDataSpill; // Whether to force all user data to be spilled (Currently only for RT). + bool enableMapClipDistMask; // For OGL only, whether to remap the clip distances. + unsigned clipPlaneMask; // For OGL only, defines the bitmask for enabling/disabling clip planes. }; }; static_assert(sizeof(Options) == sizeof(Options::u32All)); @@ -653,6 +657,12 @@ struct GeometryShaderMode { unsigned robustGsEmits; // robust buffer access }; +// Kind of derivativeMode: +// None: Return 0 for derivatives calculation of compute shader +// Linear: Calculating derivatives in linear mode(4*1) +// Quads: Calculating derivatives in Quads mode(2*2) +enum class DerivativeMode : unsigned { None, Linear, Quads }; + // Struct to pass to MeshShaderMode. The front-end should zero-initialize it with "= {}" in case // future changes add new fields. // All fields are unsigned, even those that could be bool, because the way the state is written to and read @@ -664,6 +674,7 @@ struct MeshShaderMode { unsigned workgroupSizeX; // X dimension of workgroup size. 0 is taken to be 1 unsigned workgroupSizeY; // Y dimension of workgroup size. 0 is taken to be 1 unsigned workgroupSizeZ; // Z dimension of workgroup size. 
0 is taken to be 1 + DerivativeMode derivativeMode; // DerivativeMode for meshShader }; // Kind of conservative depth/stencil @@ -688,12 +699,6 @@ struct FragmentShaderMode { unsigned waveOpsRequireHelperLanes; }; -// Kind of derivativeMode: -// None: Return 0 for derivatives calculation of compute shader -// Linear: Calculating derivatives in linear mode(4*1) -// Quads: Calculating derivatives in Quads mode(2*2) -enum class DerivativeMode : unsigned { None, Linear, Quads }; - // Struct to pass to SetComputeShaderMode. // The front-end should zero-initialize it with "= {}" in case future changes add new fields. // All fields are unsigned, even those that could be bool, because the way the state is written to and read @@ -703,7 +708,7 @@ struct ComputeShaderMode { unsigned workgroupSizeY; // Y dimension of workgroup size. 0 is taken to be 1 unsigned workgroupSizeZ; // Z dimension of workgroup size. 0 is taken to be 1 unsigned subgroupSize; // Override for the wave size if it is non-zero - DerivativeMode derivatives; // derivativeMode for computeShader + DerivativeMode derivativeMode; // derivativeMode for computeShader unsigned noLocalInvocationIdInCalls; // For compute with calls, assume local invocation ID is never used in callees }; diff --git a/lgc/patch/Continufy.cpp b/lgc/patch/Continufy.cpp index cc9ef99f7b..795cac042c 100644 --- a/lgc/patch/Continufy.cpp +++ b/lgc/patch/Continufy.cpp @@ -35,6 +35,7 @@ #include "lgc/Builder.h" #include "lgc/LgcCpsDialect.h" #include "lgc/LgcDialect.h" +#include "lgc/LgcIlCpsDialect.h" #include "lgc/LgcRtDialect.h" #include "lgc/patch/Patch.h" #include "lgc/state/PalMetadata.h" @@ -178,24 +179,25 @@ PreservedAnalyses Continufy::run(Module &module, ModuleAnalysisManager &analysis } // Translate 'ret' into lgc.cps.jump for continufy stages. - if (!currentRtStage.has_value()) - continue; - // Skip the 'ret' in RGS. 
- if (currentRtStage.value() == (int32_t)RtStage::RayGeneration) - continue; Instruction *term = block.getTerminator(); if (auto *retInst = dyn_cast(term)) { builder.SetInsertPoint(term); - auto *retValue = retInst->getReturnValue(); - // %rcr, %shader-index - SmallVector tailArgs = {PoisonValue::get(builder.getInt32Ty()), - PoisonValue::get(builder.getInt32Ty())}; - // return value - if (retValue) - tailArgs.push_back(retValue); - - builder.create(fnPtr->getArg(1), getReturnedLevels(currentRtStage.value()), - PoisonValue::get(StructType::get(context, {})) /* state */, tailArgs); + + if (!currentRtStage.has_value() || currentRtStage.value() == (int32_t)RtStage::RayGeneration) { + builder.create(); + } else { + Value *poisonI32 = PoisonValue::get(builder.getInt32Ty()); + auto *retValue = retInst->getReturnValue(); + // %rcr, %shader-index + SmallVector tailArgs = {poisonI32}; + // return value + if (retValue) + tailArgs.push_back(retValue); + + builder.create(fnPtr->getArg(1), getReturnedLevels(currentRtStage.value()), + PoisonValue::get(StructType::get(context, {})) /* state */, poisonI32, tailArgs); + } + builder.CreateUnreachable(); term->eraseFromParent(); } diff --git a/lgc/patch/LowerCooperativeMatrix.cpp b/lgc/patch/LowerCooperativeMatrix.cpp index f5d20ea013..ad7b274447 100644 --- a/lgc/patch/LowerCooperativeMatrix.cpp +++ b/lgc/patch/LowerCooperativeMatrix.cpp @@ -122,6 +122,7 @@ LowerCooperativeMatrix::TypeProperties LowerCooperativeMatrix::getTypeProperties props.numMatrixElements = 8; props.numMatrixWords = 8; break; + case CooperativeMatrixElementType::BFloat16: case CooperativeMatrixElementType::Float16: case CooperativeMatrixElementType::Float16Packed: case CooperativeMatrixElementType::Int16: @@ -141,7 +142,7 @@ LowerCooperativeMatrix::TypeProperties LowerCooperativeMatrix::getTypeProperties assert(elemType != CooperativeMatrixElementType::Float32 && elemType != CooperativeMatrixElementType::Int32); props.numFlatElements = 16; } else if (layout == 
CooperativeMatrixLayout::AccumulatorMatrixLayout) { - if (elemType == CooperativeMatrixElementType::Float16 || elemType == CooperativeMatrixElementType::Int16) { + if (BuilderCommon::isTypeNCooperativeMatrix(elemType, 16)) { props.matrixElementStride = 2; } if (elemType == CooperativeMatrixElementType::Float16Packed) { @@ -526,9 +527,21 @@ Value *LowerCooperativeMatrix::cooperativeMatrixConvertInternal(CastInst::CastOp builder.SetInsertPoint(insertPos); Value *resultValue = nullptr; const unsigned vecSize = cast(source->getType())->getNumElements(); - Type *dstType = FixedVectorType::get(builder.transCooperativeMatrixElementType(dstElemType), vecSize); - if ((srcElemType == CooperativeMatrixElementType::Float16 || srcElemType == CooperativeMatrixElementType::Float32) && + Type *dstType = nullptr; + if (dstElemType == CooperativeMatrixElementType::BFloat16) + dstType = FixedVectorType::get(builder.getBFloatTy(), vecSize); + else + dstType = FixedVectorType::get(builder.transCooperativeMatrixElementType(dstElemType), vecSize); + + if (srcElemType == CooperativeMatrixElementType::BFloat16) { + assert(source->getType()->isIntOrIntVectorTy()); + auto *bfloat16Vec = FixedVectorType::get(builder.getBFloatTy(), vecSize); + source = builder.CreateBitCast(source, bfloat16Vec); + } + + if ((srcElemType == CooperativeMatrixElementType::Float16 || srcElemType == CooperativeMatrixElementType::BFloat16 || + srcElemType == CooperativeMatrixElementType::Float32) && (castOp == Instruction::FPToUI || castOp == Instruction::FPToSI)) { // FIXME: fp16's range is covered by i32. So `fptoi half` can convert // to i32 first following a sext/zext to target integer type. 
@@ -538,10 +551,20 @@ Value *LowerCooperativeMatrix::cooperativeMatrixConvertInternal(CastInst::CastOp if (builder.transCooperativeMatrixElementType(dstElemType)->getScalarSizeInBits() < 32) { resultValue = builder.CreateTrunc(resultValue, dstType); } + } else if (castOp == Instruction::FPTrunc && (srcElemType == CooperativeMatrixElementType::Float16 || + srcElemType == CooperativeMatrixElementType::BFloat16)) { + // Float16 -> BFloat16 or BFloat16 -> Float16 + resultValue = builder.CreateCast(Instruction::FPExt, source, FixedVectorType::get(builder.getFloatTy(), vecSize), + "Convert16tofloat32"); + resultValue = builder.CreateFPTrunc(resultValue, dstType); } else { resultValue = builder.CreateCast(castOp, source, dstType, "castOpConvert"); } + if (dstElemType == CooperativeMatrixElementType::BFloat16) { + resultValue = builder.CreateBitCast(resultValue, FixedVectorType::get(builder.getInt16Ty(), vecSize)); + } + return resultValue; } @@ -1391,38 +1414,54 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul Value *matrixD; unsigned waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); - if (factorElemType == CooperativeMatrixElementType::Float16 || - factorElemType == CooperativeMatrixElementType::Int16) { + if (BuilderCommon::isTypeNCooperativeMatrix(factorElemType, 16)) { unsigned factorFlatElemNum = 0; { factorFlatElemNum = 16; } Type *factorType = FixedVectorType::get(builder.transCooperativeMatrixElementType(factorElemType), factorFlatElemNum); matrixA = builder.CreateBitCast(matrixA, factorType); matrixB = builder.CreateBitCast(matrixB, factorType); - } else if (factorElemType == CooperativeMatrixElementType::Int8) { + } else if (BuilderCommon::isTypeNCooperativeMatrix(factorElemType, 8)) { } else { llvm_unreachable("Factor element type is not supported!"); } - if (accumElemType == CooperativeMatrixElementType::Float32 || - accumElemType == CooperativeMatrixElementType::Int32) { + if 
(BuilderCommon::isTypeNCooperativeMatrix(accumElemType, 32)) { matrixC = waveSize == 64 ? builder.CreateShuffleVector(matrixC, ArrayRef({0, 1, 2, 3}), "shuffleVector") : matrixC; - } else if (accumElemType == CooperativeMatrixElementType::Float16 || - accumElemType == CooperativeMatrixElementType::Int16) { + } else if (BuilderCommon::isTypeNCooperativeMatrix(accumElemType, 16)) { { matrixC = waveSize == 64 ? builder.CreateShuffleVector(matrixC, ArrayRef({0, 1, 2, 3}), "shuffleVector") : matrixC; } unsigned matrixLength = cast(matrixC->getType())->getNumElements(); - Type *accumType = FixedVectorType::get(builder.getHalfTy(), matrixLength * 2); + + Type *castType = nullptr; + if (accumElemType == CooperativeMatrixElementType::BFloat16) { + // HW instructions require i16 type for bfloat16. + castType = builder.getInt16Ty(); + } else + castType = builder.getHalfTy(); + Type *accumType = FixedVectorType::get(castType, matrixLength * 2); matrixC = builder.CreateBitCast(matrixC, accumType); } else { llvm_unreachable("Accumulator element type is not supported!"); } - if (factorElemType == CooperativeMatrixElementType::Float16 && - accumElemType == CooperativeMatrixElementType::Float32) { + if (factorElemType == CooperativeMatrixElementType::BFloat16) { + Intrinsic::AMDGCNIntrinsics intrinsic = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16; + SmallVector args({matrixA, matrixB, matrixC}); + if (accumElemType == CooperativeMatrixElementType::Float32) + intrinsic = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16; + else { + assert(accumElemType == CooperativeMatrixElementType::BFloat16); + args.push_back(builder.getInt1(isSatOrOpsel)); + if (muladd.getIsTied()) + intrinsic = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied; + } + matrixD = builder.CreateIntrinsic(matrixC->getType(), intrinsic, args, nullptr, instName); + } else if (factorElemType == CooperativeMatrixElementType::Float16 && + accumElemType == CooperativeMatrixElementType::Float32) { matrixD = 
builder.CreateIntrinsic(matrixC->getType(), Intrinsic::amdgcn_wmma_f32_16x16x16_f16, {matrixA, matrixB, matrixC}, nullptr, instName); @@ -1451,8 +1490,7 @@ void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul llvm_unreachable("The accumulator type is not supported."); } - if (accumElemType == CooperativeMatrixElementType::Float16 || - accumElemType == CooperativeMatrixElementType::Int16) { + if (BuilderCommon::isTypeNCooperativeMatrix(accumElemType, 16)) { unsigned coopVeclength = cast(matrixD->getType())->getNumElements(); Type *wordTy = builder.transCooperativeMatrixElementType(accumElemType)->isIntOrIntVectorTy() ? builder.getInt32Ty() diff --git a/lgc/patch/LowerDebugPrintf.cpp b/lgc/patch/LowerDebugPrintf.cpp index ad8ab7bdae..98bfe7a2d4 100644 --- a/lgc/patch/LowerDebugPrintf.cpp +++ b/lgc/patch/LowerDebugPrintf.cpp @@ -74,10 +74,9 @@ PreservedAnalyses LowerDebugPrintf::run(Module &module, ModuleAnalysisManager &a if (printfFuncs.empty()) return PreservedAnalyses::all(); - bool hasPrintfDesc = - pipelineState - ->findResourceNode(ResourceNodeType::DescriptorBuffer, InternalDescriptorSetId, PrintfBufferBindingId) - .second != nullptr; + const ResourceNode *node = nullptr; + std::tie(m_topNode, node) = pipelineState->findResourceNode(ResourceNodeType::DescriptorBuffer, + InternalDescriptorSetId, PrintfBufferBindingId); static const auto lowerDebugfPrintOpVisitor = llvm_dialects::VisitorBuilder() .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) @@ -88,9 +87,10 @@ PreservedAnalyses LowerDebugPrintf::run(Module &module, ModuleAnalysisManager &a for (auto func : printfFuncs) { // Create printbuffer Descriptor at the beginning of the function which contains DebugPrintf dialect ops builder.SetInsertPointPastAllocas(func); - m_debugPrintfBuffer = hasPrintfDesc ? 
builder.CreateBufferDesc(InternalDescriptorSetId, PrintfBufferBindingId, - builder.getInt32(0), 2, true) - : nullptr; + m_debugPrintfBuffer = + (m_topNode != nullptr) + ? builder.CreateBufferDesc(InternalDescriptorSetId, PrintfBufferBindingId, builder.getInt32(0), 2, true) + : nullptr; lowerDebugfPrintOpVisitor.visit(*this, *func); } @@ -252,6 +252,7 @@ void LowerDebugPrintf::setupElfsPrintfStrings() { auto printfStrings = document->getRoot().getMap(true)[Util::Abi::PalCodeObjectMetadataKey::PrintfStrings].getMap(true); printfStrings[".version"] = 1; + printfStrings[".user_data_offset"] = m_topNode->offsetInDwords; auto formatStrings = printfStrings[".strings"].getArray(true); unsigned i = 0; for (auto it = m_elfInfos.begin(); it != m_elfInfos.end(); ++it, ++i) { diff --git a/lgc/patch/LowerGpuRt.cpp b/lgc/patch/LowerGpuRt.cpp index 7366a351b5..33a65e3fbd 100644 --- a/lgc/patch/LowerGpuRt.cpp +++ b/lgc/patch/LowerGpuRt.cpp @@ -82,6 +82,7 @@ PreservedAnalyses LowerGpuRt::run(Module &module, ModuleAnalysisManager &analysi .add(&LowerGpuRt::visitWaveScanOp) .add(&LowerGpuRt::visitGetKnownSetRayFlagsOp) .add(&LowerGpuRt::visitGetKnownUnsetRayFlagsOp) + .add(&LowerGpuRt::visitInitStaticId) .build(); visitor.visit(*this, module); @@ -547,4 +548,15 @@ void LowerGpuRt::visitGetKnownUnsetRayFlagsOp(lgc::GpurtGetKnownUnsetRayFlagsOp m_funcsToLower.insert(inst.getCalledFunction()); } +// ===================================================================================================================== +// Visit "GpurtInitStaticIdOp" instruction (replaced by a hash of pipeline hash, module name and a running counter) +// NOTE(review): Options::hash is uint64_t[2]; confirm hash_combine hashes its contents, not the decayed pointer. +// @param inst : The dialect instruction to process +void LowerGpuRt::visitInitStaticId(lgc::GpurtInitStaticIdOp &inst) { + inst.replaceAllUsesWith(m_builder->getInt32( + llvm::hash_combine(m_pipelineState->getOptions().hash, inst.getModule()->getName(), m_rayStaticId++))); + m_callsToLower.push_back(&inst); + m_funcsToLower.insert(inst.getCalledFunction()); +} + +} // namespace lgc diff --git a/lgc/patch/LowerPopsInterlock.cpp
b/lgc/patch/LowerPopsInterlock.cpp new file mode 100644 index 0000000000..75eb39c7b5 --- /dev/null +++ b/lgc/patch/LowerPopsInterlock.cpp @@ -0,0 +1,387 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file LowerPopsInterlock.cpp + * @brief LGC source file: contains implementation of class lgc::LowerPopsInterlock. 
+ *********************************************************************************************************************** + */ +#include "LowerPopsInterlock.h" +#include "lgc/state/PipelineState.h" +#include "lgc/state/TargetInfo.h" +#include "lgc/util/BuilderBase.h" +#include "llvm-dialects/Dialect/Visitor.h" +#include "llvm/Analysis/CycleAnalysis.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" + +#define DEBUG_TYPE "lgc-lower-pops-interlock" + +using namespace llvm; + +namespace lgc { + +// ===================================================================================================================== +// Executes this LLVM patching pass on the specified LLVM function. +// +// @param [in/out] func : LLVM function to be run on +// @param [in/out] funcAnalysisManager : Analysis manager to use for this transformation +// @returns : The preserved analyses (The analyses that are still valid after this pass) +PreservedAnalyses LowerPopsInterlock::run(Function &func, FunctionAnalysisManager &funcAnalysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Lower-POPS-Interlock\n"); + + // Not fragment shader, skip + if (getShaderStage(&func) != ShaderStageEnum::Fragment) + return PreservedAnalyses::all(); + + auto &moduleAnalysisManager = funcAnalysisManager.getResult(func); + m_pipelineState = moduleAnalysisManager.getCachedResult(*func.getParent())->getPipelineState(); + m_entryPoint = &func; + + BuilderBase builder(m_pipelineState->getContext()); + m_builder = &builder; + + legalizeInterlock(funcAnalysisManager); + lowerInterlock(); + + return m_changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} + +// ===================================================================================================================== +// Legalize POPS interlock operations. +// +// In this function, we collect all begin_interlock and end_interlock operations and merge them into only one pair.
+// Although GLSL spec says the two operations must be in main function without any control flow, we use them to support +// DX raster-order-view (ROV) feature. In such a case, if we have multiple ROVs, each ROV can have a pair of +// begin/end_interlock to gate them and such pairs may be in conditional path of control flow. Our plan is to find the +// first use of ROVs and insert begin_interlock before it. If the insert block is in a cycle, we try to search up its +// ancestors until we find an appropriate insert point. Likewise, we insert end_interlock after the last use of ROVs. +// We search down descendants of the insert block if it is in a cycle. It is required by HW that the pair of +// begin/end_interlock can only be executed once for each wave. +// +// @param funcAnalysisManager : Analysis manager to use for this transformation +void LowerPopsInterlock::legalizeInterlock(FunctionAnalysisManager &funcAnalysisManager) { + // + // Collect all begin_interlock and end_interlock operations for further analysis. + // + static auto visitor = llvm_dialects::VisitorBuilder() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .add(&LowerPopsInterlock::collectBeginInterlock) + .add(&LowerPopsInterlock::collectEndInterlock) + .build(); + visitor.visit(*this, *m_entryPoint); + + // Skip further processing if there are no POPS interlock operations + if (m_beginInterlocks.empty() && m_endInterlocks.empty()) + return; + + auto &domTree = funcAnalysisManager.getResult(*m_entryPoint); + auto &postDomTree = funcAnalysisManager.getResult(*m_entryPoint); + auto &cycleInfo = funcAnalysisManager.getResult(*m_entryPoint); + + // + // Legalize begin_interlock by doing two steps: + // 1. Find the closest common dominator of all begin_interlocks + // 2. If that is in a cycle, move up to the terminator of the cycle predecessor (NOTE(review): done once, no loop).
+ // + assert(!m_beginInterlocks.empty()); // Must have at least one begin_interlock + auto nearestDom = m_beginInterlocks.front(); + for (unsigned i = 1; i < m_beginInterlocks.size(); ++i) + nearestDom = domTree.findNearestCommonDominator(nearestDom, m_beginInterlocks[i]); + + if (auto cycle = cycleInfo.getCycle(nearestDom->getParent())) + nearestDom = cycle->getCyclePredecessor()->getTerminator(); + + m_builder->SetInsertPoint(nearestDom); + m_builder->create(); + + // + // Legalize end_interlock by doing two steps: + // 1. Find the closest common post dominator of all end_interlocks + // 2. If that is in a cycle, move to the first exit block of the cycle until it is not in a cycle. + // + assert(!m_endInterlocks.empty()); // Must have at least one end_interlock + auto nearestPostDom = m_endInterlocks.front(); + for (unsigned i = 1; i < m_endInterlocks.size(); ++i) { + const auto endInterlock = m_endInterlocks[i]; + if (endInterlock->getParent() == nearestPostDom->getParent()) { + // In the same block, maybe update nearest post dominator + if (nearestPostDom->comesBefore(endInterlock)) + nearestPostDom = endInterlock; + } else { + auto nearestPostDomBlock = + postDomTree.findNearestCommonDominator(nearestPostDom->getParent(), endInterlock->getParent()); + if (nearestPostDomBlock != nearestPostDom->getParent()) { + // Block of the nearest post dominator is changed, have to update nearest post dominator + if (nearestPostDomBlock == endInterlock->getParent()) { + // It is the block of current end_interlock, use current end_interlock as the new nearest post dominator + nearestPostDom = endInterlock; + } else { + nearestPostDom = &*nearestPostDomBlock->getFirstInsertionPt(); + } + } + } + } + + while (auto cycle = cycleInfo.getCycle(nearestPostDom->getParent())) { + SmallVector succBlocks; + cycle->getExitBlocks(succBlocks); + nearestPostDom = &*succBlocks[0]->getFirstInsertionPt(); + }; + + m_builder->SetInsertPoint(nearestPostDom); + m_builder->create(); + + // + // Clean up: erase the collected begin/end_interlock operations + // + for (auto beginInterlock :
m_beginInterlocks) { + beginInterlock->dropAllReferences(); + beginInterlock->eraseFromParent(); + } + m_beginInterlocks.clear(); + + for (auto endInterlock : m_endInterlocks) { + endInterlock->dropAllReferences(); + endInterlock->eraseFromParent(); + } + m_endInterlocks.clear(); + + m_changed = true; +} + +// ===================================================================================================================== +// Collect begin_interlock operations. +// +// @param popsBeginInterlockOp : Call instruction op to begin a POPS critical section +void LowerPopsInterlock::collectBeginInterlock(PopsBeginInterlockOp &popsBeginInterlockOp) { + m_beginInterlocks.push_back(&popsBeginInterlockOp); +} + +// ===================================================================================================================== +// Collect end_interlock operations. +// +// @param popsEndInterlockOp : Call instruction op to end a POPS critical section +void LowerPopsInterlock::collectEndInterlock(PopsEndInterlockOp &popsEndInterlockOp) { + m_endInterlocks.push_back(&popsEndInterlockOp); +} + +// ===================================================================================================================== +// Lower POPS interlock operations.
+void LowerPopsInterlock::lowerInterlock() { + static auto visitor = llvm_dialects::VisitorBuilder() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .add(&LowerPopsInterlock::lowerBeginInterlock) + .add(&LowerPopsInterlock::lowerEndInterlock) + .build(); + visitor.visit(*this, *m_entryPoint); + + if (!m_beginInterlocks.empty()) { + assert(m_beginInterlocks.size() == 1); // Must have only one begin_interlock after legalization + m_beginInterlocks[0]->dropAllReferences(); + m_beginInterlocks[0]->eraseFromParent(); + m_beginInterlocks.clear(); + } + + if (!m_endInterlocks.empty()) { + assert(m_endInterlocks.size() == 1); // Must have only one end_interlock after legalization + m_endInterlocks[0]->dropAllReferences(); + m_endInterlocks[0]->eraseFromParent(); + m_endInterlocks.clear(); + } +} + +// ===================================================================================================================== +// Lower begin_interlock operation. +// +// @param popsBeginInterlockOp : Call instruction op to begin a POPS critical section +void LowerPopsInterlock::lowerBeginInterlock(PopsBeginInterlockOp &popsBeginInterlockOp) { + m_beginInterlocks.push_back(&popsBeginInterlockOp); + + m_builder->SetInsertPoint(&popsBeginInterlockOp); + + // + // The processing is something like this: + // + // Pre-GFX11: + // The layout of collision wave ID is as follows: + // + // +------------+-----------+---------------------------+-----------------+ + // | Overlapped | Packer ID | Newest Overlapped Wave ID | Current Wave ID | + // | [31] | [29:28] | [25:16] | [9:0] | + // +------------+-----------+---------------------------+-----------------+ + // + // POPS_BEGIN_INTERLOCK() { + // isOverlapped = collisionWaveId[31] + // if (isOverlapped) { + // packerId = collisionWaveId[29:28] + // s_setreg(HW_REG_POPS_PACKER, (packerId << 1) | 0x1) + // + // currentWaveId = collisionWaveId[9:0] + // waveIdRemapOffset = -(currentWaveId + 1) = ~currentWaveId + // + //
newestOverlappedWaveId = collisionWaveId[25:16] + // newestOverlappedWaveId += waveIdRemapOffset + // + // Load srcPopsExitingWaveId + // srcPopsExitingWaveId += waveIdRemapOffset + // while (srcPopsExitingWaveId <= newestOverlappedWaveId) { + // s_sleep(0xFFFF) + // Reload srcPopsExitingWaveId + // srcPopsExitingWaveId += waveIdRemapOffset + // } + // } + // } + // + // GFX11+: + // POPS_BEGIN_INTERLOCK() { + // s_wait_event(EXPORT_READY) + // } + // + const auto gfxIp = m_pipelineState->getTargetInfo().getGfxIpVersion(); + if (gfxIp.major >= 11) { + m_builder->CreateIntrinsic(m_builder->getVoidTy(), Intrinsic::amdgcn_s_wait_event_export_ready, {}); + return; + } + + auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Fragment)->entryArgIdxs.fs; + auto collisionWaveId = getFunctionArgument(m_entryPoint, entryArgIdxs.collisionWaveId); + + auto checkOverlapBlock = m_builder->GetInsertBlock(); + auto processOverlapBlock = checkOverlapBlock->splitBasicBlock(&popsBeginInterlockOp, ".processOverlap"); + auto waveWaitingHeaderBlock = processOverlapBlock->splitBasicBlock(&popsBeginInterlockOp, ".waveWaitingHeader"); + auto waveWaitingBodyBlock = waveWaitingHeaderBlock->splitBasicBlock(&popsBeginInterlockOp, ".waveWaitingBody"); + auto endProcessOverlapBlock = waveWaitingBodyBlock->splitBasicBlock(&popsBeginInterlockOp, ".endProcessOverlap"); + + // Modify ".checkOverlap" block + { + m_builder->SetInsertPoint(checkOverlapBlock->getTerminator()); + + auto isOverlapped = m_builder->CreateAnd(m_builder->CreateLShr(collisionWaveId, 31), 0x1); + isOverlapped = m_builder->CreateTrunc(isOverlapped, m_builder->getInt1Ty()); + m_builder->CreateCondBr(isOverlapped, processOverlapBlock, endProcessOverlapBlock); + + checkOverlapBlock->getTerminator()->eraseFromParent(); // Remove old terminator + } + + // Construct ".processOverlap" block + Value *waveIdRemapOffset = nullptr; + Value *newestOverlappedWaveId = nullptr; + { + 
m_builder->SetInsertPoint(processOverlapBlock->getTerminator()); + + auto packerId = m_builder->CreateAnd(m_builder->CreateLShr(collisionWaveId, 28), 0x3); + // POPS_PACKER: [0] Enable; [2:1] Packer ID + auto hwReg = [=](unsigned hwRegId, unsigned offset, unsigned size) { + // The HW register of s_setreg has this layout: + // [5:0] ID of HW register; [10:6] Offset; [15:11] Size + return ((hwRegId) | (offset << 6) | ((size - 1) << 11)); + }; + static const unsigned HwRegPopsPacker = 25; + auto popsPacker = m_builder->CreateOr(m_builder->CreateShl(packerId, 1), 0x1); + m_builder->CreateIntrinsic(m_builder->getVoidTy(), Intrinsic::amdgcn_s_setreg, + {m_builder->getInt32(hwReg(HwRegPopsPacker, 0, 3)), popsPacker}); + + // waveIdRemapOffset = -(currentWaveId + 1) = ~currentWaveId + auto currentWaveId = m_builder->CreateAnd(collisionWaveId, 0x3FF); + waveIdRemapOffset = m_builder->CreateNot(currentWaveId); + + // newestOverlappedWaveId += waveIdRemapOffset + newestOverlappedWaveId = m_builder->CreateAnd(m_builder->CreateLShr(collisionWaveId, 16), 0x3FF); + newestOverlappedWaveId = m_builder->CreateAdd(newestOverlappedWaveId, waveIdRemapOffset); + } + + // Construct ".waveWaitingHeader" block + { + m_builder->SetInsertPoint(waveWaitingHeaderBlock->getTerminator()); + + Value *popsExitingWaveId = + m_builder->CreateIntrinsic(m_builder->getInt32Ty(), Intrinsic::amdgcn_pops_exiting_wave_id, {}); + popsExitingWaveId = m_builder->CreateAdd(popsExitingWaveId, waveIdRemapOffset); + + Value *needToWait = m_builder->CreateICmpULE(popsExitingWaveId, newestOverlappedWaveId); + m_builder->CreateCondBr(needToWait, waveWaitingBodyBlock, endProcessOverlapBlock); + + waveWaitingHeaderBlock->getTerminator()->eraseFromParent(); // Remove old terminator + } + + // Construct ".waveWaitingBody" block + { + m_builder->SetInsertPoint(waveWaitingBodyBlock->getTerminator()); + + static const unsigned WaitTime = 0xFFFF; + m_builder->CreateIntrinsic(Intrinsic::amdgcn_s_sleep, {}, 
m_builder->getInt32(WaitTime)); + + m_builder->CreateBr(waveWaitingHeaderBlock); + + waveWaitingBodyBlock->getTerminator()->eraseFromParent(); // Remove old terminator + } + + // Currently, nothing to do to construct ".endProcessOverlap" block + + m_changed = true; +} + +// ===================================================================================================================== +// Lower end_interlock operation. +// +// @param popsEndInterlockOp : Call instruction op to end a POPS critical section +void LowerPopsInterlock::lowerEndInterlock(PopsEndInterlockOp &popsEndInterlockOp) { + m_endInterlocks.push_back(&popsEndInterlockOp); + + m_builder->SetInsertPoint(&popsEndInterlockOp); + + // + // The processing is something like this: + // + // Pre-GFX11: + // POPS_END_INTERLOCK() { + // s_wait_vscnt null, 0x0 + // s_sendmsg(MSG_ORDERED_PS_DONE) + // } + // + // GFX11+: + // POPS_END_INTERLOCK() { + // s_wait_vscnt null, 0x0 + // } + // + + // Add s_wait_vscnt null, 0x0 to make sure the completion of all writes + SyncScope::ID syncScope = m_builder->getContext().getOrInsertSyncScopeID("agent"); + m_builder->CreateFence(AtomicOrdering::Release, syncScope); + + const auto gfxIp = m_pipelineState->getTargetInfo().getGfxIpVersion(); + if (gfxIp.major < 11) { + auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Fragment)->entryArgIdxs.fs; + auto primMask = getFunctionArgument(m_entryPoint, entryArgIdxs.primMask); + + m_builder->CreateIntrinsic(Intrinsic::amdgcn_s_sendmsg, {}, {m_builder->getInt32(OrderedPsDone), primMask}); + } + + m_changed = true; +} + +} // namespace lgc diff --git a/lgc/patch/LowerPopsInterlock.h b/lgc/patch/LowerPopsInterlock.h new file mode 100644 index 0000000000..0e26d4fb89 --- /dev/null +++ b/lgc/patch/LowerPopsInterlock.h @@ -0,0 +1,68 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, 
Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file LowerPopsInterlock.h + * @brief LGC header file: contains declaration of lgc::LowerPopsInterlock + *********************************************************************************************************************** + */ +#pragma once + +#include "lgc/LgcDialect.h" +#include "llvm/IR/PassManager.h" + +namespace lgc { + +class BuilderBase; +class PipelineState; + +class LowerPopsInterlock : public llvm::PassInfoMixin { +public: + llvm::PreservedAnalyses run(llvm::Function &func, llvm::FunctionAnalysisManager &funcAnalysisManager); + + static llvm::StringRef name() { return "Lower POPS interlock operations"; } + +private: + void legalizeInterlock(llvm::FunctionAnalysisManager &funcAnalysisManager); + void collectBeginInterlock(PopsBeginInterlockOp &popsBeginInterlockOp); + void collectEndInterlock(PopsEndInterlockOp &popsEndInterlockOp); + + void lowerInterlock(); + void lowerBeginInterlock(PopsBeginInterlockOp &popsBeginInterlockOp); + void lowerEndInterlock(PopsEndInterlockOp &popsEndInterlockOp); + + PipelineState *m_pipelineState = nullptr; // Pipeline state + llvm::Function *m_entryPoint = nullptr; // Entry-point of fragment shader + + BuilderBase *m_builder = nullptr; // LLVM IR builder + + // List of POPS interlock operations + llvm::SmallVector m_beginInterlocks; + llvm::SmallVector m_endInterlocks; + + bool m_changed = false; // Whether the IR is changed by this pass +}; + +} // namespace lgc diff --git a/lgc/patch/MeshTaskShader.cpp b/lgc/patch/MeshTaskShader.cpp index a693be432e..0fcd139544 100644 --- a/lgc/patch/MeshTaskShader.cpp +++ b/lgc/patch/MeshTaskShader.cpp @@ -32,6 +32,7 @@ #include "ShaderMerger.h" #include "lgc/patch/Patch.h" #include "lgc/util/Debug.h" +#include 
"lgc/util/WorkgroupLayout.h" #include "llvm-dialects/Dialect/Visitor.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" @@ -307,8 +308,18 @@ unsigned MeshTaskShader::layoutMeshShaderLds(PipelineState *pipelineState, Funct assert(outputsLayout); outputsLayout->primitiveStride = primitiveStride; + bool hasDummyVertexAttrib = false; + if (!pipelineState->exportAttributeByExportInstruction()) { + if (outputsLayout->vertexExportCount == 0) { + // NOTE: HW allocates and manages attribute ring based on the register fields: VS_EXPORT_COUNT and + // PRIM_EXPORT_COUNT. When VS_EXPORT_COUNT = 0, HW assumes there is still a vertex attribute exported even + // though this is not what we want. Hence, we should reserve param0 as a dummy vertex attribute. + hasDummyVertexAttrib = true; + } + } + unsigned offsetInPrimitive = 0; - const unsigned startSlot = outputsLayout->vertexExportCount; + const unsigned startSlot = hasDummyVertexAttrib ? 1 : outputsLayout->vertexExportCount; unsigned exportSlot = startSlot; unsigned exportCount = 0; @@ -1513,7 +1524,19 @@ void MeshTaskShader::lowerGetMeshBuiltinInput(GetMeshBuiltinInputOp &getMeshBuil break; } case BuiltInLocalInvocationId: { - input = getMeshLocalInvocationId(); + // Insert a call that later on might get lowered to code to reconfigure the workgroup. 
+ auto &mode = m_pipelineState->getShaderModes()->getMeshShaderMode(); + unsigned workgroupSizeX = mode.workgroupSizeX; + unsigned workgroupSizeY = mode.workgroupSizeY; + unsigned workgroupSizeZ = mode.workgroupSizeZ; + SwizzleWorkgroupLayout layout = calculateWorkgroupLayout(m_pipelineState, ShaderStage::Mesh); + if ((layout.microLayout == WorkgroupLayout::Quads) || (layout.macroLayout == WorkgroupLayout::SexagintiQuads)) { + input = getMeshLocalInvocationId(true /* foldXY = true */); + input = reconfigWorkgroupLayout(input, m_pipelineState, ShaderStage::Mesh, layout.macroLayout, layout.microLayout, + workgroupSizeX, workgroupSizeY, workgroupSizeZ, false, m_builder); + } else { + input = getMeshLocalInvocationId(); + } break; } case BuiltInGlobalInvocationId: { @@ -2510,15 +2533,7 @@ void MeshTaskShader::doExport(ExportKind kind, ArrayRef exports) { // ringOffset = attribRingBaseOffset + 32 * exportSlot * 16 // = attribRingBaseOffset + exportSlot * 512 - unsigned exportSlot = exports[i].slot; - if (kind == ExportKind::PrimAttr && m_hasNoVertexAttrib) { - // NOTE: HW allocates and manages attribute ring based on the register fields: VS_EXPORT_COUNT and - // PRIM_EXPORT_COUNT. When VS_EXPORT_COUNT = 0, HW assumes there is still a vertex attribute exported even - // though this is not what we want. Hence, we should reserve param0 as a dummy vertex attribute and all - // primitive attributes are moved after it. - ++exportSlot; - } - auto locationOffset = m_builder.getInt32(exportSlot * SizeOfVec4); + auto locationOffset = m_builder.getInt32(exports[i].slot * SizeOfVec4); CoherentFlag coherent = {}; if (m_pipelineState->getTargetInfo().getGfxIpVersion().major <= 11) { @@ -2558,10 +2573,8 @@ void MeshTaskShader::prepareAttribRingAccess() { // NOTE: HW allocates and manages attribute ring based on the register fields: VS_EXPORT_COUNT and PRIM_EXPORT_COUNT. 
// When VS_EXPORT_COUNT = 0, HW assumes there is still a vertex attribute exported even though this is not what we // want. Hence, we should reserve param0 as a dummy vertex attribute. - if (m_outputsLayout.vertexExportCount == 0) { - m_hasNoVertexAttrib = true; + if (m_outputsLayout.vertexExportCount == 0) ++attribCount; // Count in this dummy vertex attribute - } // attribRingBase[14:0] auto entryPoint = m_builder.GetInsertBlock()->getParent(); @@ -2688,8 +2701,9 @@ Value *MeshTaskShader::getMeshWorkgroupId() { // ===================================================================================================================== // Get the built-in LocalInvocationId of mesh shader. // +// @param foldXY : Specify whether the locationId.x and locationId.y should be folded. // @returns : Value of the built-in LocalInvocationId -Value *MeshTaskShader::getMeshLocalInvocationId() { +Value *MeshTaskShader::getMeshLocalInvocationId(bool foldXY) { auto entryPoint = m_builder.GetInsertBlock()->getParent(); assert(getShaderStage(entryPoint) == ShaderStage::Mesh); // Must be mesh shader @@ -2734,8 +2748,18 @@ Value *MeshTaskShader::getMeshLocalInvocationId() { diff = m_builder.CreateSub(localInvocationIndex, diff); localInvocationIdY = m_builder.CreateUDiv(diff, workgroupSizeX, "localInvocationIdY"); - localInvocationIdX = m_builder.CreateMul(workgroupSizeX, localInvocationIdY); - localInvocationIdX = m_builder.CreateSub(diff, localInvocationIdX, "localInvocationIdX"); + if (foldXY) { + localInvocationIdX = diff; + // Unused dimensions need zero-initializing. 
+ if (meshMode.workgroupSizeZ <= 1) { + if (meshMode.workgroupSizeY <= 1) + localInvocationIdY = m_builder.getInt32(0); + localInvocationIdZ = m_builder.getInt32(0); + } + } else { + localInvocationIdX = m_builder.CreateMul(workgroupSizeX, localInvocationIdY); + localInvocationIdX = m_builder.CreateSub(diff, localInvocationIdX, "localInvocationIdX"); + } } Value *localInvocationId = PoisonValue::get(FixedVectorType::get(m_builder.getInt32Ty(), 3)); @@ -2962,17 +2986,20 @@ void MeshTaskShader::updateMeshShaderInOutUsage() { inOutUsage.primExpCount = m_outputsLayout.primitiveExportCount; // For part pipeline, below info will be used to build the metadata ".preraster_output_semantic" to correctly map - // output locations specified by API mesh shader to HW export slots. The export slots will be used to fill the - // register field SPI_PS_INPUT_CNTL.OFFSET during pipeline linking. + // output semantic locations specified by API mesh shader to HW export slots. The export slots will be used to fill + // the register field SPI_PS_INPUT_CNTL.OFFSET during pipeline linking. if (m_pipelineState->isUnlinked()) { - inOutUsage.outputLocInfoMap.clear(); - for (auto &genericExport : m_outputsLayout.vertexGenericExports) { - const auto &[location, exportSlot] = genericExport; - InOutLocationInfo locInfo = {}; - locInfo.setLocation(location); - InOutLocationInfo newLocInfo = {}; - newLocInfo.setLocation(exportSlot); - inOutUsage.outputLocInfoMap[locInfo] = newLocInfo; + for (auto it = inOutUsage.outputLocInfoMap.begin(); it != inOutUsage.outputLocInfoMap.end();) { + // Revisit each entry of vertex outputs. If it is recorded and processed by mesh shader, update the mapping + // location to HW export slot. Otherwise, remove this entry. 
+ const unsigned mappingLocation = it->second.getLocation(); + if (m_outputsLayout.vertexGenericExports.count(mappingLocation) > 0) { + const unsigned exportSlot = m_outputsLayout.vertexGenericExports[mappingLocation]; + it->second.setLocation(exportSlot); + it++; + } else { + inOutUsage.outputLocInfoMap.erase(it++); + } } inOutUsage.builtInOutputLocMap.clear(); @@ -2981,10 +3008,17 @@ void MeshTaskShader::updateMeshShaderInOutUsage() { inOutUsage.builtInOutputLocMap[builtIn] = exportSlot; } - inOutUsage.perPrimitiveOutputLocMap.clear(); - for (auto &genericExport : m_outputsLayout.primitiveGenericExports) { - const auto &[location, exportSlot] = genericExport; - inOutUsage.perPrimitiveOutputLocMap[location] = exportSlot; + for (auto it = inOutUsage.perPrimitiveOutputLocMap.begin(); it != inOutUsage.perPrimitiveOutputLocMap.end();) { + // Revisit each entry of primitive outputs. If it is recorded and processed by mesh shader, update the mapping + // location to HW export slot. Otherwise, remove this entry. 
+ const unsigned mappingLocation = it->second; + if (m_outputsLayout.primitiveGenericExports.count(mappingLocation) > 0) { + const unsigned exportSlot = m_outputsLayout.primitiveGenericExports[mappingLocation]; + it->second = exportSlot; + it++; + } else { + inOutUsage.perPrimitiveOutputLocMap.erase(it++); + } } inOutUsage.perPrimitiveBuiltInOutputLocMap.clear(); diff --git a/lgc/patch/MeshTaskShader.h b/lgc/patch/MeshTaskShader.h index 27204e5617..84ed5b1b12 100644 --- a/lgc/patch/MeshTaskShader.h +++ b/lgc/patch/MeshTaskShader.h @@ -99,7 +99,6 @@ class MeshTaskShader { void processTaskShader(llvm::Function *entryPoint); void processMeshShader(llvm::Function *entryPoint); - void lowerTaskPayloadPtr(TaskPayloadPtrOp &taskPayloadPtrOp); void lowerEmitMeshTasks(EmitMeshTasksOp &emitMeshTasksOp); void lowerSetMeshOutputs(SetMeshOutputsOp &setMeshOutputsOp); @@ -143,7 +142,7 @@ class MeshTaskShader { llvm::Value *getMeshFlatWorkgroupId(); llvm::Value *getMeshNumWorkgroups(); llvm::Value *getMeshWorkgroupId(); - llvm::Value *getMeshLocalInvocationId(); + llvm::Value *getMeshLocalInvocationId(bool foldXY = false); llvm::Value *getMeshLocalInvocationIndex(); llvm::Value *getMeshGlobalInvocationId(); @@ -224,7 +223,6 @@ class MeshTaskShader { llvm::Value *m_shaderRingEntryIndex = nullptr; // Shader ring entry index of current workgroup llvm::Value *m_payloadRingEntryOffset = nullptr; // Entry offset (in bytes) of the payload ring - bool m_hasNoVertexAttrib = false; // Whether mesh shader has vertex attribute export or not llvm::Value *m_attribRingBufDesc = nullptr; // Attribute ring buffer descriptor llvm::Value *m_attribRingBaseOffset = nullptr; // Subgroup's attribute ring base offset (in bytes) diff --git a/lgc/patch/NggPrimShader.cpp b/lgc/patch/NggPrimShader.cpp index de17aef8ac..a1e25e904d 100644 --- a/lgc/patch/NggPrimShader.cpp +++ b/lgc/patch/NggPrimShader.cpp @@ -132,7 +132,7 @@ NggPrimShader::NggPrimShader(PipelineState *pipelineState) unsigned 
vertexItemSizes[MaxGsStreams] = {}; auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry); for (unsigned i = 0; i < MaxGsStreams; ++i) - vertexItemSizes[i] = 4 * resUsage->inOutUsage.gs.outLocCount[i]; + vertexItemSizes[i] = resUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[i]; unsigned gsVsRingItemSizes[MaxGsStreams] = {}; const auto &geometryMode = m_pipelineState->getShaderModes()->getGeometryShaderMode(); @@ -3126,8 +3126,8 @@ void NggPrimShader::runEs(ArrayRef args) { if (m_hasGs) { auto &calcFactor = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor; unsigned waveSize = m_pipelineState->getShaderWaveSize(ShaderStage::Geometry); - unsigned esGsBytesPerWave = waveSize * sizeof(unsigned) * calcFactor.esGsRingItemSize; - esGsOffset = m_builder.CreateMul(m_nggInputs.waveIdInSubgroup, m_builder.getInt32(esGsBytesPerWave)); + esGsOffset = + m_builder.CreateMul(m_nggInputs.waveIdInSubgroup, m_builder.getInt32(waveSize * calcFactor.esGsRingItemSize)); } Value *offChipLdsBase = args[ShaderMerger::getSpecialSgprInputIndex(m_gfxIp, EsGs::OffChipLdsBase)]; @@ -7831,7 +7831,8 @@ Value *NggPrimShader::calcVertexItemOffset(unsigned streamId, Value *vertexIndex auto &inOutUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage; // vertexOffset = gsVsRingStart + streamBases[stream] + vertexIndex * vertexItemSize (in dwords) - const unsigned vertexItemSize = 4 * inOutUsage.gs.outLocCount[streamId]; + const unsigned vertexItemSize = inOutUsage.gs.calcFactor.gsVsVertexItemSize[streamId]; + auto vertexOffset = m_builder.CreateMul(vertexIndex, m_builder.getInt32(vertexItemSize)); vertexOffset = m_builder.CreateAdd(vertexOffset, m_builder.getInt32(m_gsStreamBases[streamId])); diff --git a/lgc/patch/PassRegistry.inc b/lgc/patch/PassRegistry.inc index ecf66e9967..c2018e9f34 100644 --- a/lgc/patch/PassRegistry.inc +++ b/lgc/patch/PassRegistry.inc @@ -82,6 +82,10 @@ 
LLPC_MODULE_PASS("lgc-frag-color-export", LowerFragColorExport) LLPC_MODULE_PASS("lgc-lower-debug-printf", LowerDebugPrintf) LLPC_MODULE_PASS("lgc-lower-desc", LowerDesc) +#if LLPC_BUILD_STRIX1 +LLPC_MODULE_PASS("lgc-workaround-ds-subdword-write", WorkaroundDsSubdwordWrite) +#endif + LLPC_FUNCTION_PASS("lgc-combine-cooperative-matrix", CombineCooperativeMatrix) LLPC_MODULE_PASS("lgc-lower-cooperative-matrix", LowerCooperativeMatrix) LLPC_MODULE_PASS("lgc-lower-gpurt", LowerGpuRt) diff --git a/lgc/patch/Patch.cpp b/lgc/patch/Patch.cpp index 746bf9de2c..fbb01c5c57 100644 --- a/lgc/patch/Patch.cpp +++ b/lgc/patch/Patch.cpp @@ -29,6 +29,7 @@ *********************************************************************************************************************** */ #include "lgc/patch/Patch.h" +#include "LowerPopsInterlock.h" #include "LowerRayQueryWrapper.h" #include "PatchNullFragShader.h" #include "llvmraytracing/Continuations.h" @@ -36,10 +37,8 @@ #include "lgc/PassManager.h" #include "lgc/Pipeline.h" #include "lgc/builder/BuilderReplayer.h" -#include "lgc/patch/CombineCooperativeMatrix.h" #include "lgc/patch/Continufy.h" #include "lgc/patch/FragColorExport.h" -#include "lgc/patch/LowerCooperativeMatrix.h" #include "lgc/patch/LowerDebugPrintf.h" #include "lgc/patch/LowerDesc.h" #include "lgc/patch/LowerGpuRt.h" @@ -65,6 +64,11 @@ #include "lgc/patch/PatchWorkarounds.h" #include "lgc/patch/TcsPassthroughShader.h" #include "lgc/patch/VertexFetch.h" +#if LLPC_BUILD_STRIX1 +#include "lgc/patch/WorkaroundDsSubdwordWrite.h" +#endif +#include "lgc/patch/CombineCooperativeMatrix.h" +#include "lgc/patch/LowerCooperativeMatrix.h" #include "lgc/state/AbiMetadata.h" #include "lgc/state/PipelineState.h" #include "lgc/state/TargetInfo.h" @@ -207,6 +211,7 @@ void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, T passMgr.addPass(LowerDebugPrintf()); passMgr.addPass(LowerDesc()); passMgr.addPass(PatchEntryPointMutate()); + 
passMgr.addPass(createModuleToFunctionPassAdaptor(LowerPopsInterlock())); passMgr.addPass(PatchInitializeWorkgroupMemory()); passMgr.addPass(PatchInOutImportExport()); @@ -214,6 +219,10 @@ void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, T passMgr.addPass(createModuleToFunctionPassAdaptor(PatchInvariantLoads())); passMgr.addPass(createModuleToFunctionPassAdaptor(createFunctionToLoopPassAdaptor(PatchLoopMetadata()))); +#if LLPC_BUILD_STRIX1 + passMgr.addPass(WorkaroundDsSubdwordWrite()); +#endif + if (patchTimer) { LgcContext::createAndAddStartStopTimer(passMgr, patchTimer, false); LgcContext::createAndAddStartStopTimer(passMgr, optTimer, true); diff --git a/lgc/patch/PatchBufferOp.cpp b/lgc/patch/PatchBufferOp.cpp index 6c3044f299..c6948a2258 100644 --- a/lgc/patch/PatchBufferOp.cpp +++ b/lgc/patch/PatchBufferOp.cpp @@ -408,8 +408,12 @@ void BufferOpLowering::visitAtomicCmpXchgInst(AtomicCmpXchgInst &atomicCmpXchgIn copyMetadata(newAtomicCmpXchg, &atomicCmpXchgInst); return newAtomicCmpXchg; }; + // The index should be used when a strided pointer is converted to offset mode. + Value *index = nullptr; + if (atomicCmpXchgInst.getPointerOperand()->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER) + index = values[2]; Value *result = - createGlobalPointerAccess(bufferDesc, baseIndex, storeType, atomicCmpXchgInst, createAtomicCmpXchgFunc); + createGlobalPointerAccess(bufferDesc, baseIndex, index, storeType, atomicCmpXchgInst, createAtomicCmpXchgFunc); // Record the atomic instruction so we remember to delete it later. 
m_typeLowering.eraseInstruction(&atomicCmpXchgInst); @@ -502,7 +506,12 @@ void BufferOpLowering::visitAtomicRMWInst(AtomicRMWInst &atomicRmwInst) { copyMetadata(newAtomicRmw, &atomicRmwInst); return newAtomicRmw; }; - Value *result = createGlobalPointerAccess(bufferDesc, baseIndex, storeType, atomicRmwInst, createAtomicRmwFunc); + // The index should be used when a strided pointer is converted to offset mode. + Value *index = nullptr; + if (atomicRmwInst.getPointerOperand()->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER) + index = values[2]; + Value *result = + createGlobalPointerAccess(bufferDesc, baseIndex, index, storeType, atomicRmwInst, createAtomicRmwFunc); // Record the atomic instruction so we remember to delete it later. m_typeLowering.eraseInstruction(&atomicRmwInst); @@ -1517,7 +1526,11 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { } return result; }; - return createGlobalPointerAccess(bufferDesc, baseIndex, type, inst, createLoadStoreFunc); + // The index should be used when a strided pointer is converted to offset mode. 
+ Value *index = nullptr; + if (pointerOperand->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER) + index = pointerValues[2]; + return createGlobalPointerAccess(bufferDesc, baseIndex, index, type, inst, createLoadStoreFunc); } switch (ordering) { @@ -1802,14 +1815,27 @@ Instruction *BufferOpLowering::makeLoop(Value *const loopStart, Value *const loo // // @param bufferDesc: The buffer descriptor // @param offset: The offset on the global memory +// @param strideIndex: The index for a strided access; nullptr when the pointer is not a strided buffer pointer // @param type: The accessed data type // @param inst: The instruction to be executed on the buffer // @param callback: The callback function to perform the specific global access -Value *BufferOpLowering::createGlobalPointerAccess(Value *const bufferDesc, Value *const offset, Type *const type, - Instruction &inst, const function_ref callback) { +Value *BufferOpLowering::createGlobalPointerAccess(Value *const bufferDesc, Value *const offset, + Value *const strideIndex, Type *const type, Instruction &inst, + const function_ref callback) { // The 2nd element (NUM_RECORDS) in the buffer descriptor is byte bound. Value *bound = m_builder.CreateExtractElement(bufferDesc, 2); - Value *inBound = m_builder.CreateICmpULT(offset, bound); + Value *newOffset = offset; + + // strideIndex is only set for strided accesses, for which the stride is read from the SRD and applied here. + if (strideIndex) { + Value *desc1 = m_builder.CreateExtractElement(bufferDesc, 1); + Value *stride = + m_builder.CreateAnd(m_builder.CreateLShr(desc1, m_builder.getInt32(16)), m_builder.getInt32(0x3fff)); + bound = m_builder.CreateMul(bound, stride); + newOffset = m_builder.CreateAdd(m_builder.CreateMul(strideIndex, stride), newOffset); + } + + Value *inBound = m_builder.CreateICmpULT(newOffset, bound); // If null descriptor or extended robust buffer access is allowed, we will create a branch to perform normal global // access based on the valid check. 
@@ -1831,15 +1857,10 @@ Value *BufferOpLowering::createGlobalPointerAccess(Value *const bufferDesc, Valu } // Global pointer access Value *baseAddr = getBaseAddressFromBufferDesc(bufferDesc); - Value *newOffset = nullptr; - if (m_pipelineState.getOptions().enableExtendedRobustBufferAccess) { - // No need to check out-of-bind if the extended robustness check is already done - newOffset = offset; - } else { - // NOTE: The offset of out-of-bound overridden as 0 may causes unexpected result when the extended robustness access - // is disabled. - newOffset = m_builder.CreateSelect(inBound, offset, m_builder.getInt32(0)); - } + // NOTE: The offset of out-of-bound overridden as 0 may cause unexpected result when the extended robustness access + // is disabled. + if (!m_pipelineState.getOptions().enableExtendedRobustBufferAccess) + newOffset = m_builder.CreateSelect(inBound, newOffset, m_builder.getInt32(0)); // Add on the index to the address. Value *pointer = m_builder.CreateGEP(m_builder.getInt8Ty(), baseAddr, newOffset); diff --git a/lgc/patch/PatchEntryPointMutate.cpp b/lgc/patch/PatchEntryPointMutate.cpp index c5e357af0e..2e0808f118 100644 --- a/lgc/patch/PatchEntryPointMutate.cpp +++ b/lgc/patch/PatchEntryPointMutate.cpp @@ -165,7 +165,6 @@ PreservedAnalyses PatchEntryPointMutate::run(Module &module, ModuleAnalysisManag processGroupMemcpy(module); processDriverTableLoad(module); - processPops(module); return PreservedAnalyses::none(); } @@ -509,204 +508,6 @@ void PatchEntryPointMutate::lowerGroupMemcpy(GroupMemcpyOp &groupMemcpyOp) { } } -// ===================================================================================================================== -// Process PopsBeginCriticalSectionOp and PopsEndCriticalSectionOp. 
-// -// @param module : LLVM module -void PatchEntryPointMutate::processPops(llvm::Module &module) { - SmallVector callsToRemove; - - struct Payload { - SmallVectorImpl &callsToRemove; - PatchEntryPointMutate *self; - }; - - Payload payload = {callsToRemove, this}; - static auto visitor = llvm_dialects::VisitorBuilder() - .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) - .add([](auto &payload, auto &op) { - payload.self->lowerPopsBeginInterlock(op); - payload.callsToRemove.push_back(&op); - }) - .add([](auto &payload, auto &op) { - payload.self->lowerPopsEndInterlock(op); - payload.callsToRemove.push_back(&op); - }) - .build(); - visitor.visit(payload, module); - - for (auto call : payload.callsToRemove) - call->eraseFromParent(); -} - -// ===================================================================================================================== -// Lower PopsBeginInterlockOp. -// -// @param popsBeginInterlockOp : Call instruction op to begin a POPS critical section -void PatchEntryPointMutate::lowerPopsBeginInterlock(PopsBeginInterlockOp &popsBeginInterlockOp) { - Function *entryPoint = popsBeginInterlockOp.getFunction(); - assert(getShaderStage(entryPoint) == ShaderStage::Fragment); // Must be FS - - BuilderBase builder(&popsBeginInterlockOp); - - // - // The processing is something like this: - // - // Pre-GFX11: - // The layout of collision wave ID is as follow: - // - // +------------+-----------+---------------------------+-----------------+ - // | Overlapped | Packer ID | Newest Overlapped Wave ID | Current Wave ID | - // | [31] | [29:28] | [25:16] | [9:0] | - // +------------+-----------+---------------------------+-----------------+ - // - // POPS_BEGIN_INTERLOCK() { - // isOverlapped = collisionWaveId[31] - // if (isOverlapped) { - // packerId = collisionWaveId[29:28] - // s_setreg(HW_REG_POPS_PACKER, (packerId << 1) & 0x1)) - // - // currentWaveId = collisionWaveId[9:0] - // waveIdRemapOffset = -(currentWaveId + 1) = 
~currentWaveId - // - // newestOverlappedWaveId = collisionWaveId[25:16] - // newestOverlappedWaveId += waveIdRemapOffset - // - // Load srcPopsExitingWaveId - // srcPopsExitingWaveId += waveIdRemapOffset - // while (srcPopsExitingWaveId <= newestOverlappedWaveId) { - // s_sleep(0xFFFF) - // Reload srcPopsExitingWaveId - // srcPopsExitingWaveId += waveIdRemapOffset - // } - // } - // } - // - // GFX11+: - // POPS_BEGIN_INTERLOCK() { - // s_wait_event(EXPORT_READY) - // } - // - auto gfxIp = m_pipelineState->getTargetInfo().getGfxIpVersion(); - if (gfxIp.major >= 11) { - builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_s_wait_event_export_ready, {}); - return; - } - - auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Fragment)->entryArgIdxs.fs; - auto collisionWaveId = getFunctionArgument(entryPoint, entryArgIdxs.collisionWaveId); - - auto checkOverlapBlock = builder.GetInsertBlock(); - auto processOverlapBlock = checkOverlapBlock->splitBasicBlock(&popsBeginInterlockOp, ".processOverlap"); - auto waveWaitingHeaderBlock = processOverlapBlock->splitBasicBlock(&popsBeginInterlockOp, ".waveWaitingHeader"); - auto waveWaitingBodyBlock = waveWaitingHeaderBlock->splitBasicBlock(&popsBeginInterlockOp, ".waveWaitingBody"); - auto endProcessOverlapBlock = waveWaitingBodyBlock->splitBasicBlock(&popsBeginInterlockOp, ".endProcessOverlap"); - - // Modify ".checkOverlap" block - { - builder.SetInsertPoint(checkOverlapBlock->getTerminator()); - - auto isOverlapped = builder.CreateAnd(builder.CreateLShr(collisionWaveId, 31), 0x1); - isOverlapped = builder.CreateTrunc(isOverlapped, builder.getInt1Ty()); - builder.CreateCondBr(isOverlapped, processOverlapBlock, endProcessOverlapBlock); - - checkOverlapBlock->getTerminator()->eraseFromParent(); // Remove old terminator - } - - // Construct ".processOverlap" block - Value *waveIdRemapOffset = nullptr; - Value *newestOverlappedWaveId = nullptr; - { - 
builder.SetInsertPoint(processOverlapBlock->getTerminator()); - - auto packerId = builder.CreateAnd(builder.CreateLShr(collisionWaveId, 28), 0x3); - // POPS_PACKER: [0] Enable; [2:1] Packer ID - auto hwReg = [=](unsigned hwRegId, unsigned offset, unsigned size) { - // The HW register of s_setreg has this layout: - // [5:0] ID of HW register; [10:6] Offset; [15:11] Size - return ((hwRegId) | (offset << 6) | ((size - 1) << 11)); - }; - static const unsigned HwRegPopsPacker = 25; - auto popsPacker = builder.CreateOr(builder.CreateShl(packerId, 1), 0x1); - builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_s_setreg, - {builder.getInt32(hwReg(HwRegPopsPacker, 0, 3)), popsPacker}); - - // waveIdRemapOffset = -(currentWaveId + 1) = ~currentWaveId - auto currentWaveId = builder.CreateAnd(collisionWaveId, 0x3FF); - waveIdRemapOffset = builder.CreateNot(currentWaveId); - - // newestOverlappedWaveId += waveIdRemapOffset - newestOverlappedWaveId = builder.CreateAnd(builder.CreateLShr(collisionWaveId, 16), 0x3FF); - newestOverlappedWaveId = builder.CreateAdd(newestOverlappedWaveId, waveIdRemapOffset); - } - - // Construct ".waveWaitingHeader" block - { - builder.SetInsertPoint(waveWaitingHeaderBlock->getTerminator()); - - Value *popsExitingWaveId = - builder.CreateIntrinsic(builder.getInt32Ty(), Intrinsic::amdgcn_pops_exiting_wave_id, {}); - popsExitingWaveId = builder.CreateAdd(popsExitingWaveId, waveIdRemapOffset); - - Value *needToWait = builder.CreateICmpULE(popsExitingWaveId, newestOverlappedWaveId); - builder.CreateCondBr(needToWait, waveWaitingBodyBlock, endProcessOverlapBlock); - - waveWaitingHeaderBlock->getTerminator()->eraseFromParent(); // Remove old terminator - } - - // Construct ".waveWaitingBody" block - { - builder.SetInsertPoint(waveWaitingBodyBlock->getTerminator()); - - static const unsigned WaitTime = 0xFFFF; - builder.CreateIntrinsic(Intrinsic::amdgcn_s_sleep, {}, builder.getInt32(WaitTime)); - - builder.CreateBr(waveWaitingHeaderBlock); - - 
waveWaitingBodyBlock->getTerminator()->eraseFromParent(); // Remove old terminator - } - - // Currently, nothing to do to construct ".endProcessOverlap" block -} - -// ===================================================================================================================== -// Lower PopsEndInterlockOp. -// -// @param popsEndInterlockOp : Call instruction op to end a POPS critical section -void PatchEntryPointMutate::lowerPopsEndInterlock(PopsEndInterlockOp &popsEndInterlockOp) { - Function *entryPoint = popsEndInterlockOp.getFunction(); - assert(getShaderStage(entryPoint) == ShaderStage::Fragment); // Must be FS - - BuilderBase builder(&popsEndInterlockOp); - - // - // The processing is something like this: - // - // Pre-GFX11: - // POPS_END_INTERLOCK() { - // s_wait_vscnt null, 0x0 - // s_sendmsg(MSG_ORDERED_PS_DONE) - // } - // - // GFX11+: - // POPS_END_INTERLOCK() { - // s_wait_vscnt null, 0x0 - // } - // - - // Add s_wait_vscnt null, 0x0 to make sure the completion of all writes - SyncScope::ID syncScope = builder.getContext().getOrInsertSyncScopeID("agent"); - builder.CreateFence(AtomicOrdering::Release, syncScope); - - auto gfxIp = m_pipelineState->getTargetInfo().getGfxIpVersion(); - if (gfxIp.major < 11) { - auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Fragment)->entryArgIdxs.fs; - auto primMask = getFunctionArgument(entryPoint, entryArgIdxs.primMask); - - builder.CreateIntrinsic(Intrinsic::amdgcn_s_sendmsg, {}, {builder.getInt32(OrderedPsDone), primMask}); - } -} - // ===================================================================================================================== // Lower as.continuation.reference call. 
// diff --git a/lgc/patch/PatchInOutImportExport.cpp b/lgc/patch/PatchInOutImportExport.cpp index f49e2b0a22..2040857824 100644 --- a/lgc/patch/PatchInOutImportExport.cpp +++ b/lgc/patch/PatchInOutImportExport.cpp @@ -37,6 +37,7 @@ #include "lgc/state/PalMetadata.h" #include "lgc/state/PipelineShaders.h" #include "lgc/util/Debug.h" +#include "lgc/util/WorkgroupLayout.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Support/Debug.h" @@ -442,12 +443,9 @@ void PatchInOutImportExport::processShader() { } } - if (m_shaderStage == ShaderStage::Compute) { - // In a compute shader, process lgc.reconfigure.local.invocation.id calls. - // This does not particularly have to be done here; it could be done anywhere after BuilderImpl. + if (m_shaderStage == ShaderStage::Compute || m_shaderStage == ShaderStage::Task) { + auto &mode = m_pipelineState->getShaderModes()->getComputeShaderMode(); for (Function &func : *m_module) { - auto &mode = m_pipelineState->getShaderModes()->getComputeShaderMode(); - // Different with above, this will force the threadID swizzle which will rearrange thread ID within a group into // blocks of 8*4, not to reconfig workgroup automatically and will support to be swizzled in 8*4 block // split. 
@@ -455,7 +453,7 @@ void PatchInOutImportExport::processShader() { unsigned workgroupSizeX = mode.workgroupSizeX; unsigned workgroupSizeY = mode.workgroupSizeY; unsigned workgroupSizeZ = mode.workgroupSizeZ; - SwizzleWorkgroupLayout layout = calculateWorkgroupLayout(); + SwizzleWorkgroupLayout layout = calculateWorkgroupLayout(m_pipelineState, m_shaderStage); while (!func.use_empty()) { CallInst *reconfigCall = cast(*func.user_begin()); Value *localInvocationId = reconfigCall->getArgOperand(0); @@ -464,9 +462,9 @@ void PatchInOutImportExport::processShader() { if ((layout.microLayout == WorkgroupLayout::Quads) || (layout.macroLayout == WorkgroupLayout::SexagintiQuads)) { BuilderBase builder(reconfigCall); - localInvocationId = - reconfigWorkgroupLayout(localInvocationId, layout.macroLayout, layout.microLayout, workgroupSizeX, - workgroupSizeY, workgroupSizeZ, isHwLocalInvocationId, builder); + localInvocationId = reconfigWorkgroupLayout( + localInvocationId, m_pipelineState, m_shaderStage, layout.macroLayout, layout.microLayout, + workgroupSizeX, workgroupSizeY, workgroupSizeZ, isHwLocalInvocationId, builder); } } reconfigCall->replaceAllUsesWith(localInvocationId); @@ -1308,6 +1306,11 @@ void PatchInOutImportExport::visitReturnInst(ReturnInst &retInst) { usePointSize || useLayer || useViewportIndex || useShadingRate || enableMultiView || useEdgeFlag; // NOTE: When misc. export is present, gl_ClipDistance[] or gl_CullDistance[] should start from pos2. unsigned pos = miscExport ? 
EXP_TARGET_POS_2 : EXP_TARGET_POS_1; + + unsigned clipPlaneMask = m_pipelineState->getOptions().clipPlaneMask; + bool needMapClipDistMask = ((clipPlaneMask != 0) && m_pipelineState->getOptions().enableMapClipDistMask); + assert(!m_pipelineState->getOptions().enableMapClipDistMask || ((clipPlaneMask & 0xF) == 0)); + Value *args[] = { builder.getInt32(pos), // tgt builder.getInt32(0xF), // en @@ -1319,19 +1322,22 @@ void PatchInOutImportExport::visitReturnInst(ReturnInst &retInst) { builder.getInt1(false) // vm }; - builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_exp, args); + if (!needMapClipDistMask) { + builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_exp, args); + pos++; + } if (clipCullDistance.size() > 4) { // Do the second exporting Value *args[] = { - builder.getInt32(pos + 1), // tgt - builder.getInt32(0xF), // en - clipCullDistance[4], // src0 - clipCullDistance[5], // src1 - clipCullDistance[6], // src2 - clipCullDistance[7], // src3 - builder.getInt1(false), // done - builder.getInt1(false) // vm + builder.getInt32(pos), // tgt + builder.getInt32(0xF), // en + clipCullDistance[4], // src0 + clipCullDistance[5], // src1 + clipCullDistance[6], // src2 + clipCullDistance[7], // src3 + builder.getInt1(false), // done + builder.getInt1(false) // vm }; builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_exp, args); } @@ -1863,6 +1869,11 @@ Value *PatchInOutImportExport::patchFsGenericInputImport(Type *inputTy, unsigned unsigned startChannel = 0; if (compIdx) { startChannel = cast(compIdx)->getZExtValue(); + if (bitWidth == 64) { + // NOTE: For 64-bit input, the component index is always 64-bit based while subsequent interpolation operations + // is dword-based. We have to change the start channel accordingly. + startChannel *= 2; + } assert((startChannel + numChannels) <= (bitWidth == 64 ? 
8 : 4)); } @@ -2917,7 +2928,7 @@ void PatchInOutImportExport::patchVsBuiltInOutputExport(Value *output, unsigned const auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Vertex); auto &builtInUsage = resUsage->builtInUsage.vs; - const auto &builtInOutLocMap = resUsage->inOutUsage.builtInOutputLocMap; + auto &builtInOutLocMap = resUsage->inOutUsage.builtInOutputLocMap; switch (builtInId) { case BuiltInPosition: @@ -2929,6 +2940,7 @@ void PatchInOutImportExport::patchVsBuiltInOutputExport(Value *output, unsigned if (builtInId == BuiltInPointSize && (isa(output) || isa(output))) { // NOTE: gl_PointSize is always declared as a field of gl_PerVertex. We have to check the output // value to determine if it is actually referenced in shader. + builtInOutLocMap.erase(BuiltInPointSize); builtInUsage.pointSize = false; return; } @@ -2959,10 +2971,13 @@ void PatchInOutImportExport::patchVsBuiltInOutputExport(Value *output, unsigned if ((isa(output) || isa(output))) { // NOTE: gl_{Clip,Cull}Distance[] is always declared as a field of gl_PerVertex. We have to check the output // value to determine if it is actually referenced in shader. 
- if (builtInId == BuiltInClipDistance) + if (builtInId == BuiltInClipDistance) { + builtInOutLocMap.erase(BuiltInClipDistance); builtInUsage.clipDistance = 0; - else + } else { + builtInOutLocMap.erase(BuiltInCullDistance); builtInUsage.cullDistance = 0; + } return; } @@ -3193,7 +3208,7 @@ void PatchInOutImportExport::patchTcsBuiltInOutputExport(Value *output, unsigned void PatchInOutImportExport::patchTesBuiltInOutputExport(Value *output, unsigned builtInId, BuilderBase &builder) { const auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::TessEval); auto &builtInUsage = resUsage->builtInUsage.tes; - const auto &builtInOutLocMap = resUsage->inOutUsage.builtInOutputLocMap; + auto &builtInOutLocMap = resUsage->inOutUsage.builtInOutputLocMap; switch (builtInId) { case BuiltInPosition: @@ -3211,15 +3226,19 @@ void PatchInOutImportExport::patchTesBuiltInOutputExport(Value *output, unsigned // value to determine if it is actually referenced in shader. switch (builtInId) { case BuiltInPosition: + builtInOutLocMap.erase(BuiltInPosition); builtInUsage.position = false; return; case BuiltInPointSize: + builtInOutLocMap.erase(BuiltInPointSize); builtInUsage.pointSize = false; return; case BuiltInClipDistance: + builtInOutLocMap.erase(BuiltInClipDistance); builtInUsage.clipDistance = 0; return; case BuiltInCullDistance: + builtInOutLocMap.erase(BuiltInCullDistance); builtInUsage.cullDistance = 0; return; default: @@ -4075,13 +4094,10 @@ Value *PatchInOutImportExport::calcEsGsRingOffsetForOutput(unsigned location, un assert(m_pipelineState->hasShaderStage(ShaderStage::Geometry)); const auto &calcFactor = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor; - esGsOffset = builder.CreateLShr(esGsOffset, builder.getInt32(2)); - Value *ringOffset = builder.CreateMul(m_threadId, builder.getInt32(calcFactor.esGsRingItemSize)); - ringOffset = builder.CreateAdd(ringOffset, esGsOffset); - ringOffset = builder.CreateAdd(ringOffset, 
builder.getInt32(location * 4 + compIdx)); + return ringOffset; } @@ -4148,8 +4164,8 @@ Value *PatchInOutImportExport::calcGsVsRingOffsetForOutput(unsigned location, un unsigned streamBase = 0; for (int i = 0; i < MaxGsStreams; ++i) { streamBases[i] = streamBase; - streamBase += (resUsage->inOutUsage.gs.outLocCount[i] * - m_pipelineState->getShaderModes()->getGeometryShaderMode().outputVertices * 4); + streamBase += (resUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[i] * + m_pipelineState->getShaderModes()->getGeometryShaderMode().outputVertices); } if (m_pipelineState->isGsOnChip()) { @@ -4166,8 +4182,8 @@ Value *PatchInOutImportExport::calcGsVsRingOffsetForOutput(unsigned location, un builder.CreateMul(m_threadId, builder.getInt32(resUsage->inOutUsage.gs.calcFactor.gsVsRingItemSize)); // VertexSize is stream output vertexSize x 4 (in dwords) - unsigned vertexSize = resUsage->inOutUsage.gs.outLocCount[streamId] * 4; - auto vertexItemOffset = builder.CreateMul(vertexIdx, builder.getInt32(vertexSize)); + unsigned vertexItemSize = resUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[streamId]; + auto vertexItemOffset = builder.CreateMul(vertexIdx, builder.getInt32(vertexItemSize)); ringOffset = builder.CreateAdd(esGsLdsSize, gsVsOffset); ringOffset = builder.CreateAdd(ringOffset, ringItemOffset); ringOffset = builder.CreateAdd(ringOffset, vertexItemOffset); @@ -4176,7 +4192,6 @@ Value *PatchInOutImportExport::calcGsVsRingOffsetForOutput(unsigned location, un ringOffset = builder.CreateAdd(ringOffset, builder.getInt32(attribOffset)); } else { // ringOffset = ((location * 4 + compIdx) * maxVertices + vertexIdx) * 4 (in bytes); - unsigned outputVertices = m_pipelineState->getShaderModes()->getGeometryShaderMode().outputVertices; ringOffset = builder.CreateAdd(vertexIdx, builder.getInt32((location * 4 + compIdx) * outputVertices)); @@ -4868,140 +4883,6 @@ Value *PatchInOutImportExport::getSubgroupLocalInvocationId(BuilderBase &builder return 
subgroupLocalInvocationId; } -// ===================================================================================================================== -// Do automatic workgroup size reconfiguration in a compute shader, to allow ReconfigWorkgroupLayout -// to apply optimizations. -SwizzleWorkgroupLayout PatchInOutImportExport::calculateWorkgroupLayout() { - auto &mode = m_pipelineState->getShaderModes()->getComputeShaderMode(); - SwizzleWorkgroupLayout resultLayout = {WorkgroupLayout::Unknown, WorkgroupLayout::Unknown}; - - if (m_shaderStage == ShaderStage::Compute) { - auto &resUsage = *m_pipelineState->getShaderResourceUsage(ShaderStage::Compute); - if (resUsage.builtInUsage.cs.foldWorkgroupXY) { - llvm_unreachable("Should never be called!"); - } - - if (mode.derivatives == DerivativeMode::Quads) { - resultLayout.microLayout = WorkgroupLayout::Quads; - } else if (mode.derivatives == DerivativeMode::Linear) { - resultLayout.microLayout = WorkgroupLayout::Linear; - } - - if (m_pipelineState->getOptions().forceCsThreadIdSwizzling) { - if ((mode.workgroupSizeX >= 16) && (mode.workgroupSizeX % 8 == 0) && (mode.workgroupSizeY % 4 == 0)) { - resultLayout.macroLayout = WorkgroupLayout::SexagintiQuads; - } - } - - // If no configuration has been specified, apply a reconfigure if the compute shader uses images and the - // pipeline option was enabled. - if (m_pipelineState->getOptions().reconfigWorkgroupLayout) { - if ((mode.workgroupSizeX % 2) == 0 && (mode.workgroupSizeY % 2) == 0) { - if (mode.workgroupSizeX % 8 == 0) { - // It can be reconfigured into 8 X N - if (resultLayout.macroLayout == WorkgroupLayout::Unknown) { - resultLayout.macroLayout = WorkgroupLayout::SexagintiQuads; - } - } else { - // If our local size in the X & Y dimensions are multiples of 2, we can reconfigure. 
- if (resultLayout.microLayout == WorkgroupLayout::Unknown) { - resultLayout.microLayout = WorkgroupLayout::Quads; - } - } - } - } - } - return resultLayout; -} - -// ===================================================================================================================== -// Reconfigure the workgroup for optimization purposes. -// @param localInvocationId : This is a v3i32 shader input (three VGPRs set up in hardware). -// @param macroLayout : Swizzle the thread id into macroLayout from macro level -// @param microLayout : Swizzle the thread id into microLayout from micro level -// @param workgroupSizeX : WorkgroupSize X for thread Id numbers -// @param workgroupSizeY : WorkgroupSize Y for thread Id numbers -// @param workgroupSizeZ : WorkgroupSize Z for thread Id numbers -// @param isHwLocalInvocationId : identify whether the localInvocationId is builtInLocalInvcocationId or -// BuiltInUnswizzledLocalInvocationId -// @param builder : the builder to use -Value *PatchInOutImportExport::reconfigWorkgroupLayout(Value *localInvocationId, WorkgroupLayout macroLayout, - WorkgroupLayout microLayout, unsigned workgroupSizeX, - unsigned workgroupSizeY, unsigned workgroupSizeZ, - bool isHwLocalInvocationId, BuilderBase &builder) { - Value *apiX = builder.getInt32(0); - Value *apiY = builder.getInt32(0); - Value *newLocalInvocationId = PoisonValue::get(localInvocationId->getType()); - unsigned bitsX = 0; - unsigned bitsY = 0; - auto &resUsage = *m_pipelineState->getShaderResourceUsage(ShaderStage::Compute); - resUsage.builtInUsage.cs.foldWorkgroupXY = true; - - Value *tidXY = builder.CreateExtractElement(localInvocationId, builder.getInt32(0), "tidXY"); - Value *apiZ = builder.getInt32(0); - if (workgroupSizeZ > 1) { - apiZ = builder.CreateExtractElement(localInvocationId, builder.getInt32(1), "tidZ"); - } - // For BuiltInUnswizzledLocalInvocationId, it shouldn't swizzle and return the localInvocation without - // foldXY. 
- if (isHwLocalInvocationId) { - apiX = builder.CreateURem(tidXY, builder.getInt32(workgroupSizeX)); - apiY = builder.CreateUDiv(tidXY, builder.getInt32(workgroupSizeX)); - } else { - // Micro-tiling with quad:2x2, the thread-id will be marked as {<0,0>,<1,0>,<0,1>,<1,1>} - // for each quad. Each 4 threads will be wrapped in the same tid. - if (microLayout == WorkgroupLayout::Quads) { - apiX = builder.CreateAnd(tidXY, builder.getInt32(1)); - apiY = builder.CreateAnd(builder.CreateLShr(tidXY, builder.getInt32(1)), builder.getInt32(1)); - tidXY = builder.CreateLShr(tidXY, builder.getInt32(2)); - bitsX = 1; - bitsY = 1; - } - - // Macro-tiling with 8xN block - if (macroLayout == WorkgroupLayout::SexagintiQuads) { - unsigned bits = 3 - bitsX; - Value *subTileApiX = builder.CreateAnd(tidXY, builder.getInt32((1 << bits) - 1)); - subTileApiX = builder.CreateShl(subTileApiX, builder.getInt32(bitsX)); - apiX = builder.CreateOr(apiX, subTileApiX); - - // 1. Folding 4 threads as one tid if micro-tiling with quad before. - // After the folding, each 4 hwThreadIdX share the same tid after tid>>=bits. - // For example: hwThreadId.X = 0~3, the tid will be 0; will be {<0,0>,<1,0>,<0,1>,<1,1>} - // hwThreadId.X = 4~7, the tid will be 1; will be {<0,0>,<1,0>,<0,1>,<1,1>} - // 2. Folding 8 threads as one tid without any micro-tiling before. - // After the folding, each 8 hwThreadIdX share the same tid after tid>>=bits and only apiX are calculated. - // For example: hwThreadId.X = 0~7, tid = hwThreadId.X/8 = 0; will be {0,1,...,7} - // hwThreadId.X = 8~15, tid = hwThreadId.X/8 = 1; will be {0,1,...,7} - tidXY = builder.CreateLShr(tidXY, builder.getInt32(bits)); - bitsX = 3; - - // 1. Unfolding 4 threads, it needs to set walkY = workgroupSizeY/2 as these threads are wrapped in 2X2 size. - // 2. Unfolding 8 threads, it needs to set walkY = workgroupSizeY/2 as these threads are wrapped in 1x8 size. 
- // After unfolding these threads, it needs '| apiX and | apiY' to calculated each thread's coordinate - // in the unfolded wrap threads. - unsigned walkY = workgroupSizeY >> bitsY; - Value *tileApiY = builder.CreateShl(builder.CreateURem(tidXY, builder.getInt32(walkY)), builder.getInt32(bitsY)); - apiY = builder.CreateOr(apiY, tileApiY); - Value *tileApiX = builder.CreateShl(builder.CreateUDiv(tidXY, builder.getInt32(walkY)), builder.getInt32(bitsX)); - apiX = builder.CreateOr(apiX, tileApiX); - } else { - // Update the coordinates for each 4 wrap-threads then unfold each thread to calculate the coordinate by '| apiX - // and | apiY' - unsigned walkX = workgroupSizeX >> bitsX; - Value *tileApiX = builder.CreateShl(builder.CreateURem(tidXY, builder.getInt32(walkX)), builder.getInt32(bitsX)); - apiX = builder.CreateOr(apiX, tileApiX); - Value *tileApiY = builder.CreateShl(builder.CreateUDiv(tidXY, builder.getInt32(walkX)), builder.getInt32(bitsY)); - apiY = builder.CreateOr(apiY, tileApiY); - } - } - - newLocalInvocationId = builder.CreateInsertElement(newLocalInvocationId, apiX, uint64_t(0)); - newLocalInvocationId = builder.CreateInsertElement(newLocalInvocationId, apiY, uint64_t(1)); - newLocalInvocationId = builder.CreateInsertElement(newLocalInvocationId, apiZ, uint64_t(2)); - return newLocalInvocationId; -} - // ===================================================================================================================== // Creates the LGC intrinsic "lgc.swizzle.thread.group" to swizzle thread group for optimization purposes. 
// @@ -5098,8 +4979,8 @@ void PatchInOutImportExport::createSwizzleThreadGroupFunction() { Value *nativeWorkgroupId = argIt++; nativeWorkgroupId->setName("nativeWorkgroupId"); - static constexpr unsigned tileDims[] = {InvalidValue, 4, 8, 16}; - static constexpr unsigned tileBits[] = {InvalidValue, 2, 3, 4}; + static constexpr unsigned tileDims[] = {InvalidValue, 4, 8, 16, 32, 64}; + static constexpr unsigned tileBits[] = {InvalidValue, 2, 3, 4, 5, 6}; static_assert((sizeof(tileDims) / sizeof(unsigned)) == static_cast(ThreadGroupSwizzleMode::Count), "The length of tileDims is not as expected."); static_assert((sizeof(tileBits) / sizeof(unsigned)) == static_cast(ThreadGroupSwizzleMode::Count), diff --git a/lgc/patch/PatchResourceCollect.cpp b/lgc/patch/PatchResourceCollect.cpp index b2c79d1f78..6ba2a49442 100644 --- a/lgc/patch/PatchResourceCollect.cpp +++ b/lgc/patch/PatchResourceCollect.cpp @@ -443,6 +443,8 @@ bool PatchResourceCollect::checkGsOnChipValidity() { unsigned inVertsPerPrim = 0; bool useAdjacency = false; + unsigned gsVsVertexItemSize[MaxGsStreams] = {}; + if (hasGs) { switch (geometryMode.inputPrimitive) { case InputPrimitives::Points: @@ -533,8 +535,14 @@ bool PatchResourceCollect::checkGsOnChipValidity() { m_pipelineState, m_pipelineShaders->getEntryPoint(hasTs ? ShaderStage::TessEval : ShaderStage::Vertex)); // In dwords - const unsigned gsVsRingItemSize = - hasGs ? std::max(1u, 4 * gsResUsage->inOutUsage.outputMapLocCount * geometryMode.outputVertices) : 0; + // NOTE: Make gsVsVertexItemSize odd by "| 1", to optimize GS -> VS ring layout for LDS bank conflicts. + unsigned gsVsVertexItemTotalSize = 0; + for (int i = 0; i < MaxGsStreams; ++i) { + gsVsVertexItemSize[i] = (4 * gsResUsage->inOutUsage.gs.outLocCount[i]) | 1; + gsVsVertexItemTotalSize += gsVsVertexItemSize[i]; + } + + const unsigned gsVsRingItemSize = hasGs ? 
std::max(1u, gsVsVertexItemTotalSize * geometryMode.outputVertices) : 0; const auto &ldsGeneralUsage = NggPrimShader::layoutPrimShaderLds(m_pipelineState); const bool needsLds = ldsGeneralUsage.needsLds; @@ -728,6 +736,10 @@ bool PatchResourceCollect::checkGsOnChipValidity() { gsResUsage->inOutUsage.gs.calcFactor.esGsRingItemSize = esGsRingItemSize; gsResUsage->inOutUsage.gs.calcFactor.gsVsRingItemSize = gsVsRingItemSize; + for (int i = 0; i < MaxGsStreams; ++i) { + gsResUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[i] = gsVsVertexItemSize[i]; + } + gsResUsage->inOutUsage.gs.calcFactor.primAmpFactor = primAmpFactor; gsResUsage->inOutUsage.gs.calcFactor.enableMaxVertOut = enableMaxVertOut; gsResUsage->inOutUsage.gs.calcFactor.rayQueryLdsStackSize = rayQueryLdsStackSize; @@ -742,8 +754,13 @@ bool PatchResourceCollect::checkGsOnChipValidity() { // NOTE: Make esGsRingItemSize odd by "| 1", to optimize ES -> GS ring layout for LDS bank conflicts. const unsigned esGsRingItemSize = (4 * std::max(1u, gsResUsage->inOutUsage.inputMapLocCount)) | 1; - const unsigned gsVsRingItemSize = - 4 * std::max(1u, (gsResUsage->inOutUsage.outputMapLocCount * geometryMode.outputVertices)); + unsigned gsVsVertexItemTotalSize = 0; + for (int i = 0; i < MaxGsStreams; ++i) { + gsVsVertexItemSize[i] = (4 * gsResUsage->inOutUsage.gs.outLocCount[i]); + gsVsVertexItemTotalSize += gsVsVertexItemSize[i]; + } + + const unsigned gsVsRingItemSize = std::max(1u, (gsVsVertexItemTotalSize * geometryMode.outputVertices)); // NOTE: Make gsVsRingItemSize odd by "| 1", to optimize GS -> VS ring layout for LDS bank conflicts. const unsigned gsVsRingItemSizeOnChip = gsVsRingItemSize | 1; @@ -898,6 +915,10 @@ bool PatchResourceCollect::checkGsOnChipValidity() { gsResUsage->inOutUsage.gs.calcFactor.esGsRingItemSize = esGsRingItemSize; gsResUsage->inOutUsage.gs.calcFactor.gsVsRingItemSize = gsOnChip ? 
gsVsRingItemSizeOnChip : gsVsRingItemSize; + for (int i = 0; i < MaxGsStreams; ++i) { + gsResUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[i] = gsVsVertexItemSize[i]; + } + if (m_pipelineState->getTargetInfo().getGfxIpVersion().major == 10 && hasTs && !gsOnChip) { unsigned esVertsNum = EsVertsOffchipGsOrTess; unsigned onChipGsLdsMagicSize = (esVertsNum * esGsRingItemSize) + esGsExtraLdsDwords; @@ -945,7 +966,8 @@ bool PatchResourceCollect::checkGsOnChipValidity() { if (hasGs) { LLPC_OUTS("GS stream item sizes (in dwords):\n"); for (unsigned i = 0; i < MaxGsStreams; ++i) { - unsigned streamItemSize = gsResUsage->inOutUsage.gs.outLocCount[i] * geometryMode.outputVertices * 4; + unsigned streamItemSize = + gsResUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[i] * geometryMode.outputVertices; LLPC_OUTS(" stream[" << i << "] = " << streamItemSize); if (m_pipelineState->enableXfb()) { @@ -2837,6 +2859,18 @@ void PatchResourceCollect::clearUnusedOutput() { ++locInfoMapIt; } } + + // Remove unused output for unlinked vs shader + if (m_pipelineState->isUnlinked() && m_shaderStage == ShaderStage::Vertex) { + SmallVector unusedLocInfos; + for (auto &locInfoPair : outputLocInfoMap) { + if ((!m_outputCallLocations.contains(locInfoPair.first.getLocation())) && + (locInfoPair.first.getLocation() >= MaxInOutLocCount)) + unusedLocInfos.push_back(locInfoPair.first); + } + for (auto &locInfo : unusedLocInfos) + outputLocInfoMap.erase(locInfo); + } } // ===================================================================================================================== @@ -3692,11 +3726,19 @@ void PatchResourceCollect::clearUndefinedOutput() { } } m_outputCalls.clear(); + m_outputCallLocations.clear(); // Check if all used channels are undefined in a location in a stream for (auto &locCandidate : locCandidateInfoMap) { auto candidateCalls = locCandidate.second.candidateCalls; if (locCandidate.second.usedMask != locCandidate.second.undefMask) { 
m_outputCalls.insert(m_outputCalls.end(), candidateCalls.begin(), candidateCalls.end()); + for (auto call : candidateCalls) { + assert(call->arg_size()); + Value *locArg = call->arg_begin()->get(); + ConstantInt &locConst = cast(*locArg); + unsigned locVal = locConst.getZExtValue(); + m_outputCallLocations.insert(locVal); + } continue; } diff --git a/lgc/patch/RegisterMetadataBuilder.cpp b/lgc/patch/RegisterMetadataBuilder.cpp index 27f9662d5f..476a641d93 100644 --- a/lgc/patch/RegisterMetadataBuilder.cpp +++ b/lgc/patch/RegisterMetadataBuilder.cpp @@ -33,6 +33,7 @@ #include "lgc/state/PalMetadata.h" #include "lgc/state/PipelineState.h" #include "lgc/state/TargetInfo.h" +#include "llvm/ADT/SmallSet.h" #define DEBUG_TYPE "lgc-register-metadata-builder" @@ -129,60 +130,43 @@ void RegisterMetadataBuilder::buildPalMetadata() { if (hwStageMask & (Util::Abi::HwShaderGs | Util::Abi::HwShaderVs)) buildPaSpecificRegisters(); + // Build output semantics for part pipeline if (lastVertexProcessingStage && m_pipelineState->isUnlinked()) { - // Fill ".preraster_output_semantic" + // We collect output semantics for generic outputs and necessary built-ins that will be exported to PS as generic + // outputs (ClipDistance, CullDistance, Layer, ViewportIndex, PrimitiveId). + std::map outputSemantics; auto resUsage = m_pipelineState->getShaderResourceUsage(lastVertexProcessingStage.value()); - auto &outputLocInfoMap = resUsage->inOutUsage.outputLocInfoMap; - auto &perPrimitiveOutputLocMap = resUsage->inOutUsage.perPrimitiveOutputLocMap; - auto &builtInOutputLocMap = resUsage->inOutUsage.builtInOutputLocMap; - auto &perPrimitiveBuiltInOutputLocMap = resUsage->inOutUsage.perPrimitiveBuiltInOutputLocMap; - - // Collect semantic info for generic input and builtIns {ClipDistance, CulDistance, Layer, - // ViewportIndex, PrimitiveId} that exports via generic output as well. 
- if (!outputLocInfoMap.empty() || !perPrimitiveOutputLocMap.empty() || !builtInOutputLocMap.empty() || - !perPrimitiveBuiltInOutputLocMap.empty()) { - auto preRasterOutputSemanticNode = - getPipelineNode()[Util::Abi::PipelineMetadataKey::PrerasterOutputSemantic].getArray(true); - unsigned elemIdx = 0; - - for (auto locInfoPair : outputLocInfoMap) { - auto preRasterOutputSemanticElem = preRasterOutputSemanticNode[elemIdx].getMap(true); - preRasterOutputSemanticElem[Util::Abi::PrerasterOutputSemanticMetadataKey::Semantic] = - MaxBuiltInSemantic + locInfoPair.first.getLocation(); - preRasterOutputSemanticElem[Util::Abi::PrerasterOutputSemanticMetadataKey::Index] = - locInfoPair.second.getLocation(); - ++elemIdx; - } - for (auto locInfoPair : perPrimitiveOutputLocMap) { - auto preRasterOutputSemanticElem = preRasterOutputSemanticNode[elemIdx].getMap(true); - preRasterOutputSemanticElem[Util::Abi::PrerasterOutputSemanticMetadataKey::Semantic] = - MaxBuiltInSemantic + locInfoPair.first; - preRasterOutputSemanticElem[Util::Abi::PrerasterOutputSemanticMetadataKey::Index] = locInfoPair.second; - ++elemIdx; + for (auto locMap : resUsage->inOutUsage.outputLocInfoMap) + outputSemantics[locMap.first.getLocation()] = locMap.second.getLocation(); + + for (auto locMap : resUsage->inOutUsage.perPrimitiveOutputLocMap) + outputSemantics[locMap.first] = locMap.second; + + for (auto locMap : resUsage->inOutUsage.builtInOutputLocMap) { + if (locMap.first == BuiltInClipDistance || locMap.first == BuiltInCullDistance || + locMap.first == BuiltInLayer || locMap.first == BuiltInViewportIndex || + locMap.first == BuiltInPrimitiveId) { + outputSemantics[BuiltInSemanticMask | locMap.first] = locMap.second; } + } - for (auto locPair : builtInOutputLocMap) { - if (locPair.first == BuiltInClipDistance || locPair.first == BuiltInCullDistance || - locPair.first == BuiltInLayer || locPair.first == BuiltInViewportIndex || - locPair.first == BuiltInPrimitiveId) { - assert(locPair.first < 
MaxBuiltInSemantic); - auto preRasterOutputSemanticElem = preRasterOutputSemanticNode[elemIdx].getMap(true); - preRasterOutputSemanticElem[Util::Abi::PrerasterOutputSemanticMetadataKey::Semantic] = locPair.first; - preRasterOutputSemanticElem[Util::Abi::PrerasterOutputSemanticMetadataKey::Index] = locPair.second; - ++elemIdx; - } + for (auto locMap : resUsage->inOutUsage.perPrimitiveBuiltInOutputLocMap) { + if (locMap.first == BuiltInLayer || locMap.first == BuiltInViewportIndex || + locMap.first == BuiltInPrimitiveId) { + outputSemantics[BuiltInSemanticMask | locMap.first] = locMap.second; } + } - for (auto locPair : perPrimitiveBuiltInOutputLocMap) { - if (locPair.first == BuiltInLayer || locPair.first == BuiltInViewportIndex || - locPair.first == BuiltInPrimitiveId) { - assert(locPair.first < MaxBuiltInSemantic); - auto preRasterOutputSemanticElem = preRasterOutputSemanticNode[elemIdx].getMap(true); - preRasterOutputSemanticElem[Util::Abi::PrerasterOutputSemanticMetadataKey::Semantic] = locPair.first; - preRasterOutputSemanticElem[Util::Abi::PrerasterOutputSemanticMetadataKey::Index] = locPair.second; - ++elemIdx; - } + // Fill the PAL metadata ".preraster_output_semantic" used by later pipeline linking. 
+ if (!outputSemantics.empty()) { + auto prerasterOutputSemantics = + getPipelineNode()[Util::Abi::PipelineMetadataKey::PrerasterOutputSemantic].getArray(true); + unsigned element = 0; + for (auto outputSemantic : outputSemantics) { + auto prerasterOutputSemantic = prerasterOutputSemantics[element++].getMap(true); + prerasterOutputSemantic[Util::Abi::PrerasterOutputSemanticMetadataKey::Semantic] = outputSemantic.first; + prerasterOutputSemantic[Util::Abi::PrerasterOutputSemanticMetadataKey::Index] = outputSemantic.second; } } } @@ -327,7 +311,7 @@ void RegisterMetadataBuilder::buildEsGsRegisters() { const unsigned itemCount = 4; unsigned gsVsRingOffset = 0; for (unsigned i = 0; i < itemCount; ++i) { - unsigned itemSize = sizeof(unsigned) * gsInOutUsage.gs.outLocCount[i]; + unsigned itemSize = gsInOutUsage.gs.calcFactor.gsVsVertexItemSize[i]; itemSizeArrayNode[i] = itemSize; if (i < itemCount - 1) { gsVsRingOffset += itemSize * maxVertOut; @@ -576,9 +560,17 @@ void RegisterMetadataBuilder::buildPrimShaderRegisters() { // SPI_SHADER_GS_MESHLET_DIM auto spiShaderGsMeshletDim = getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::SpiShaderGsMeshletDim].getMap(true); - spiShaderGsMeshletDim[Util::Abi::SpiShaderGsMeshletDimMetadataKey::NumThreadX] = meshMode.workgroupSizeX - 1; - spiShaderGsMeshletDim[Util::Abi::SpiShaderGsMeshletDimMetadataKey::NumThreadY] = meshMode.workgroupSizeY - 1; - spiShaderGsMeshletDim[Util::Abi::SpiShaderGsMeshletDimMetadataKey::NumThreadZ] = meshMode.workgroupSizeZ - 1; + + if (meshBuiltInUsage.foldWorkgroupXY) { + spiShaderGsMeshletDim[Util::Abi::SpiShaderGsMeshletDimMetadataKey::NumThreadX] = + meshMode.workgroupSizeX * meshMode.workgroupSizeY - 1; + spiShaderGsMeshletDim[Util::Abi::SpiShaderGsMeshletDimMetadataKey::NumThreadY] = meshMode.workgroupSizeZ - 1; + spiShaderGsMeshletDim[Util::Abi::SpiShaderGsMeshletDimMetadataKey::NumThreadZ] = 0; + } else { + spiShaderGsMeshletDim[Util::Abi::SpiShaderGsMeshletDimMetadataKey::NumThreadX] 
= meshMode.workgroupSizeX - 1; + spiShaderGsMeshletDim[Util::Abi::SpiShaderGsMeshletDimMetadataKey::NumThreadY] = meshMode.workgroupSizeY - 1; + spiShaderGsMeshletDim[Util::Abi::SpiShaderGsMeshletDimMetadataKey::NumThreadZ] = meshMode.workgroupSizeZ - 1; + } // NOTE: If row export for mesh shader is enabled, the thread group size is set according to dimensions of work // group. Otherwise, it is set according to actual primitive amplification factor. const unsigned threadGroupSize = m_pipelineState->enableMeshRowExport() @@ -596,9 +588,6 @@ void RegisterMetadataBuilder::buildPrimShaderRegisters() { } } else { maxVertsPerSubgroup = std::min(gsInstPrimsInSubgrp * maxVertOut, NggMaxThreadsPerSubgroup); - // VGT_GS_VERT_ITEMSIZE - getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtGsVertItemsize] = - 4 * gsInOutUsage.outputMapLocCount; // VGT_GS_INSTANCE_CNT if (geometryMode.invocations > 1 || gsBuiltInUsage.invocationId) { @@ -612,9 +601,6 @@ void RegisterMetadataBuilder::buildPrimShaderRegisters() { } if (m_gfxIp.major <= 11) { - // VGT_GSVS_RING_ITEMSIZE - getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtGsvsRingItemsize] = calcFactor.gsVsRingItemSize; - // VGT_ESGS_RING_ITEMSIZE getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtEsgsRingItemsize] = (m_hasGs ? calcFactor.esGsRingItemSize : 1); @@ -966,32 +952,38 @@ void RegisterMetadataBuilder::buildPsRegisters() { hwShaderNode[Util::Abi::HardwareStageMetadataKey::UsesUavs] = resUsage->resourceWrite; } - // Fill .ps_input_semantic for partial pipeline + // Build input semantics for part pipeline if (m_pipelineState->isUnlinked()) { - // Collect semantic info for generic input and builtIns {ClipDistance, CulDistance, Layer, - // ViewportIndex, PrimitiveId} that exports via generic output as well. 
- auto &inputLocInfoMap = resUsage->inOutUsage.inputLocInfoMap; - auto &builtInInputLocMap = resUsage->inOutUsage.builtInInputLocMap; - - if (!inputLocInfoMap.empty() || !builtInInputLocMap.empty()) { - auto psInputSemanticNode = getPipelineNode()[Util::Abi::PipelineMetadataKey::PsInputSemantic].getArray(true); - unsigned elemIdx = 0; - for (auto locInfoPair : inputLocInfoMap) { - auto psInputSemanticElem = psInputSemanticNode[elemIdx].getMap(true); - psInputSemanticElem[Util::Abi::PsInputSemanticMetadataKey::Semantic] = - MaxBuiltInSemantic + locInfoPair.first.getLocation(); - ++elemIdx; + // We collect input semantics for generic inputs and necessary built-ins that will be exported from last vertex + // processing stage as generic inputs (ClipDistance, CullDistance, Layer, ViewportIndex, PrimitiveId). + SmallSet inputSemantics; + + for (auto locMap : resUsage->inOutUsage.inputLocInfoMap) + inputSemantics.insert(locMap.first.getLocation()); + + for (auto locMap : resUsage->inOutUsage.perPrimitiveInputLocMap) + inputSemantics.insert(locMap.first); + + for (auto locMap : resUsage->inOutUsage.builtInInputLocMap) { + if (locMap.first == BuiltInClipDistance || locMap.first == BuiltInCullDistance || locMap.first == BuiltInLayer || + locMap.first == BuiltInViewportIndex || locMap.first == BuiltInPrimitiveId) { + inputSemantics.insert(BuiltInSemanticMask | locMap.first); } + } - for (auto locPair : builtInInputLocMap) { - if (locPair.first == BuiltInClipDistance || locPair.first == BuiltInCullDistance || - locPair.first == BuiltInLayer || locPair.first == BuiltInViewportIndex || - locPair.first == BuiltInPrimitiveId) { - assert(locPair.first < MaxBuiltInSemantic); - auto psInputSemanticElem = psInputSemanticNode[elemIdx].getMap(true); - psInputSemanticElem[Util::Abi::PsInputSemanticMetadataKey::Semantic] = locPair.first; - ++elemIdx; - } + for (auto locMap : resUsage->inOutUsage.perPrimitiveBuiltInInputLocMap) { + if (locMap.first == BuiltInLayer || locMap.first == 
BuiltInViewportIndex || locMap.first == BuiltInPrimitiveId) { + inputSemantics.insert(BuiltInSemanticMask | locMap.first); + } + } + + // Fill the PAL metadata ".ps_input_semantic" used by later pipeline linking. + if (!inputSemantics.empty()) { + auto psInputSemantics = getPipelineNode()[Util::Abi::PipelineMetadataKey::PsInputSemantic].getArray(true); + unsigned element = 0; + for (auto inputSemantic : inputSemantics) { + auto psInputSemantic = psInputSemantics[element++].getMap(true); + psInputSemantic[Util::Abi::PsInputSemanticMetadataKey::Semantic] = inputSemantic; } } } @@ -1027,19 +1019,20 @@ void RegisterMetadataBuilder::buildCsRegisters(ShaderStageEnum shaderStage) { const auto &computeMode = m_pipelineState->getShaderModes()->getComputeShaderMode(); unsigned workgroupSizes[3] = {}; + bool foldWorkgroupXY = false; if (shaderStage == ShaderStage::Compute) { const auto &builtInUsage = resUsage->builtInUsage.cs; - if (builtInUsage.foldWorkgroupXY) { - workgroupSizes[0] = computeMode.workgroupSizeX * computeMode.workgroupSizeY; - workgroupSizes[1] = computeMode.workgroupSizeZ; - workgroupSizes[2] = 1; - } else { - workgroupSizes[0] = computeMode.workgroupSizeX; - workgroupSizes[1] = computeMode.workgroupSizeY; - workgroupSizes[2] = computeMode.workgroupSizeZ; - } - } else { + foldWorkgroupXY = builtInUsage.foldWorkgroupXY; + } else if (shaderStage == ShaderStage::Task) { assert(shaderStage == ShaderStage::Task); + const auto &builtInUsage = resUsage->builtInUsage.task; + foldWorkgroupXY = builtInUsage.foldWorkgroupXY; + } + if (foldWorkgroupXY) { + workgroupSizes[0] = computeMode.workgroupSizeX * computeMode.workgroupSizeY; + workgroupSizes[1] = computeMode.workgroupSizeZ; + workgroupSizes[2] = 1; + } else { workgroupSizes[0] = computeMode.workgroupSizeX; workgroupSizes[1] = computeMode.workgroupSizeY; workgroupSizes[2] = computeMode.workgroupSizeZ; @@ -1316,11 +1309,12 @@ void RegisterMetadataBuilder::buildPaSpecificRegisters() { 
paClVsOutCntl[Util::Abi::PaClVsOutCntlMetadataKey::VsOutMiscSideBusEna] = true; } + unsigned clipPlaneMask = m_pipelineState->getOptions().clipPlaneMask; + bool needMapClipDistMask = ((clipPlaneMask != 0) && m_pipelineState->getOptions().enableMapClipDistMask); if (clipDistanceCount > 0 || cullDistanceCount > 0) { auto paClVsOutCntl = getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::PaClVsOutCntl].getMap(true); paClVsOutCntl[Util::Abi::PaClVsOutCntlMetadataKey::VsOutCcDist0VecEna] = true; - - if (clipDistanceCount + cullDistanceCount > 4) + if ((clipDistanceCount + cullDistanceCount > 4) && !needMapClipDistMask) paClVsOutCntl[Util::Abi::PaClVsOutCntlMetadataKey::VsOutCcDist1VecEna] = true; unsigned clipDistanceMask = (1 << clipDistanceCount) - 1; @@ -1335,6 +1329,14 @@ void RegisterMetadataBuilder::buildPaSpecificRegisters() { // Note: Point primitives are only affected by the cull mask, so enable culling also based on clip distances cullDistEna[i] = ((clipDistanceMask | cullDistanceMask) >> i) & 0x1; } + + // Map CLIP_DIST_4/5/6/7 to CLIP_DIST_0/1/2/3 accordingly. 
+ if (needMapClipDistMask) { + for (unsigned i = 0; i < 4; ++i) { + clipDistEna[i] = clipDistEna[i + 4]; + } + } + paClVsOutCntl[Util::Abi::PaClVsOutCntlMetadataKey::ClipDistEna_0] = clipDistEna[0]; paClVsOutCntl[Util::Abi::PaClVsOutCntlMetadataKey::ClipDistEna_1] = clipDistEna[1]; paClVsOutCntl[Util::Abi::PaClVsOutCntlMetadataKey::ClipDistEna_2] = clipDistEna[2]; @@ -1384,7 +1386,7 @@ void RegisterMetadataBuilder::buildPaSpecificRegisters() { if (clipDistanceCount + cullDistanceCount > 0) { ++availPosCount; - if (clipDistanceCount + cullDistanceCount > 4) + if ((clipDistanceCount + cullDistanceCount > 4) && !needMapClipDistMask) ++availPosCount; } auto arrayNode = getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::SpiShaderPosFormat].getArray(true); diff --git a/lgc/patch/ShaderInputs.cpp b/lgc/patch/ShaderInputs.cpp index a79d7e975f..0c6a1253a3 100644 --- a/lgc/patch/ShaderInputs.cpp +++ b/lgc/patch/ShaderInputs.cpp @@ -764,9 +764,9 @@ ShaderInputs::ShaderInputUsage *ShaderInputs::getShaderInputUsage(ShaderStageEnu void ShaderInputs::tryOptimizeWorkgroupId(PipelineState *pipelineState, ShaderStageEnum shaderStage, Function *origFunc) { assert(shaderStage == ShaderStage::Compute); - bool useWholeWorkgroupId = false; - SmallVector extractVec3[3]; + std::array compUsedCounts = {}; SmallVector workgroupIdCallInsts; + SmallVector, 3>> extractInstsForUsers; unsigned kindId = static_cast(ShaderInput::WorkgroupId); ShaderInputUsage *inputUsage = getShaderInputsUsage(shaderStage)->inputs[kindId].get(); @@ -776,82 +776,95 @@ void ShaderInputs::tryOptimizeWorkgroupId(PipelineState *pipelineState, ShaderSt for (Instruction *&call : inputUsage->users) { if (!call) continue; + SmallVector, 3> extractInsts; for (auto user : call->users()) { if (auto extractInst = dyn_cast(user)) { if (auto indexInst = dyn_cast(extractInst->getIndexOperand())) { unsigned index = indexInst->getZExtValue(); assert(index < 3); - extractVec3[index].push_back(extractInst); + 
++compUsedCounts[index]; + extractInsts.emplace_back(extractInst, index); continue; } } - useWholeWorkgroupId = true; - break; + // Use whole workgroupId components and keep unchanged + return; } - if (!useWholeWorkgroupId) - workgroupIdCallInsts.push_back(call); + workgroupIdCallInsts.push_back(call); + extractInstsForUsers.emplace_back(extractInsts); } - if (!useWholeWorkgroupId) { - for (auto insts : extractVec3) { - if (!insts.empty()) - ++usedCompCount; - } + for (auto num : compUsedCounts) { + if (num > 0) + ++usedCompCount; } - if (usedCompCount == 3) - useWholeWorkgroupId = true; } - if (useWholeWorkgroupId) + // Use whole workgroupId components + if (usedCompCount == 3) return; - if (extractVec3[0].empty()) + if (compUsedCounts[0] == 0) origFunc->addFnAttr("amdgpu-no-workgroup-id-x"); - if (extractVec3[1].empty()) + if (compUsedCounts[1] == 0) origFunc->addFnAttr("amdgpu-no-workgroup-id-y"); - if (extractVec3[2].empty()) + if (compUsedCounts[2] == 0) origFunc->addFnAttr("amdgpu-no-workgroup-id-z"); if (!inputUsage) return; - BuilderBase builder(pipelineState->getContext()); - builder.SetInsertPoint(workgroupIdCallInsts.front()); - // Clear the original default <3xi32> workgroupId - inputUsage->users.clear(); - inputUsage = nullptr; - - if (usedCompCount == 1) { - // The processing of using one component - auto workgroupId1 = - static_cast(getInput(ShaderInput::WorkgroupId1, builder, *pipelineState->getLgcContext())); - getShaderInputUsage(shaderStage, ShaderInput::WorkgroupId1)->users.push_back(workgroupId1); + // Create mapping of indexes to components of new input + std::array componentMap = {-1}; + int index = 0; + for (unsigned i = 0; i < 3; ++i) { + if (compUsedCounts[i] > 0) + componentMap[i] = index++; + } - for (auto instSet : extractVec3) { - for (auto inst : instSet) { + BuilderBase builder(pipelineState->getContext()); + unsigned id = 0; + for (Instruction *&call : inputUsage->users) { + if (!call) + continue; + builder.SetInsertPoint(call); + 
if (usedCompCount == 1) { + // The processing of using one component + auto workgroupId1 = + static_cast(getInput(ShaderInput::WorkgroupId1, builder, *pipelineState->getLgcContext())); + getShaderInputUsage(shaderStage, ShaderInput::WorkgroupId1)->users.push_back(workgroupId1); + + for (auto [inst, index] : extractInstsForUsers[id]) { + assert(componentMap[index] == 0); inst->replaceAllUsesWith(workgroupId1); + inst->dropAllReferences(); inst->eraseFromParent(); } - } - } else if (usedCompCount == 2) { - // The processing of using two components - auto workgroupId2 = - static_cast(getInput(ShaderInput::WorkgroupId2, builder, *pipelineState->getLgcContext())); - getShaderInputUsage(shaderStage, ShaderInput::WorkgroupId2)->users.push_back(workgroupId2); - - Value *extractVec2[2] = {builder.CreateExtractElement(workgroupId2, static_cast(0)), - builder.CreateExtractElement(workgroupId2, 1)}; - unsigned index = 0; - for (auto instSet : extractVec3) { - for (auto inst : instSet) { - inst->replaceAllUsesWith(extractVec2[index]); + } else if (usedCompCount == 2) { + // The processing of using two components + auto workgroupId2 = + static_cast(getInput(ShaderInput::WorkgroupId2, builder, *pipelineState->getLgcContext())); + getShaderInputUsage(shaderStage, ShaderInput::WorkgroupId2)->users.push_back(workgroupId2); + + Value *extractVec2[2] = {builder.CreateExtractElement(workgroupId2, static_cast(0)), + builder.CreateExtractElement(workgroupId2, 1)}; + + for (auto [inst, index] : extractInstsForUsers[id]) { + assert(componentMap[index] >= 0); + inst->replaceAllUsesWith(extractVec2[componentMap[index]]); + inst->dropAllReferences(); inst->eraseFromParent(); } - if (!instSet.empty()) - ++index; + } else { + assert(usedCompCount == 0); } - } else { - assert(usedCompCount == 0); + ++id; } - for (auto call : workgroupIdCallInsts) + // Clear the original default <3xi32> workgroupId + inputUsage->users.clear(); + + for (auto call : workgroupIdCallInsts) { + 
assert(call->hasNUses(0)); + call->dropAllReferences(); call->eraseFromParent(); + } } diff --git a/lgc/patch/ShaderMerger.cpp b/lgc/patch/ShaderMerger.cpp index 14b49e5dc1..7bf1e63398 100644 --- a/lgc/patch/ShaderMerger.cpp +++ b/lgc/patch/ShaderMerger.cpp @@ -644,8 +644,9 @@ Function *ShaderMerger::generateEsGsEntryPoint(Function *esEntryPoint, Function entryPoint->addFnAttr("amdgpu-flat-work-group-size", "128,128"); // Force s_barrier to be present (ignore optimization) - const unsigned waveSize = m_pipelineState->getShaderWaveSize(ShaderStage::Geometry); - entryPoint->addFnAttr("target-features", ",+wavefrontsize" + std::to_string(waveSize)); // Set wavefront size + // NOTE: Legacy (non-NGG) HW path for GS doesn't support wave32 mode. + assert(m_pipelineState->getShaderWaveSize(ShaderStage::Geometry) == 64); + entryPoint->addFnAttr("target-features", ",+wavefrontsize64"); applyTuningAttributes(entryPoint, tuningAttrs); for (auto &arg : entryPoint->args()) { @@ -700,10 +701,7 @@ Function *ShaderMerger::generateEsGsEntryPoint(Function *esEntryPoint, Function auto threadIdInWave = builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {builder.getInt32(-1), builder.getInt32(0)}); - - if (waveSize == 64) { - threadIdInWave = builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {builder.getInt32(-1), threadIdInWave}); - } + threadIdInWave = builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {builder.getInt32(-1), threadIdInWave}); threadIdInWave->setName("threadIdInWave"); auto esVertCount = builder.CreateIntrinsic(Intrinsic::amdgcn_ubfe, {builder.getInt32Ty()}, @@ -719,8 +717,7 @@ Function *ShaderMerger::generateEsGsEntryPoint(Function *esEntryPoint, Function {mergedWaveInfo, builder.getInt32(24), builder.getInt32(4)}); waveInSubgroup->setName("waveInSubgroup"); - unsigned esGsBytesPerWave = waveSize * 4 * calcFactor.esGsRingItemSize; - auto esGsOffset = builder.CreateMul(waveInSubgroup, builder.getInt32(esGsBytesPerWave)); + auto esGsOffset = 
builder.CreateMul(waveInSubgroup, builder.getInt32(64 * calcFactor.esGsRingItemSize)); auto validEsVert = builder.CreateICmpULT(threadIdInWave, esVertCount, "validEsVert"); builder.CreateCondBr(validEsVert, beginEsBlock, endEsBlock); diff --git a/lgc/patch/SystemValues.cpp b/lgc/patch/SystemValues.cpp index 1f8396c9f3..694eb1b0dc 100644 --- a/lgc/patch/SystemValues.cpp +++ b/lgc/patch/SystemValues.cpp @@ -270,14 +270,14 @@ Value *ShaderSystemValues::getGsVsRingBufDesc(unsigned streamId) { // Geometry shader, using GS-VS ring for output. Value *desc = loadDescFromDriverTable(SiDrvTableGsRingOuT0Offs + streamId * 4, builder); - unsigned outLocStart = 0; + unsigned streamItemOffset = 0; for (int i = 0; i < streamId; ++i) - outLocStart += resUsage->inOutUsage.gs.outLocCount[i]; + streamItemOffset += resUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[i] * + m_pipelineState->getShaderModes()->getGeometryShaderMode().outputVertices; // streamSize[streamId] = outLocCount[streamId] * 4 * sizeof(unsigned) // streamOffset = (streamSize[0] + ... 
+ streamSize[streamId - 1]) * 64 * outputVertices - unsigned baseAddr = outLocStart * m_pipelineState->getShaderModes()->getGeometryShaderMode().outputVertices * - sizeof(unsigned) * 4 * 64; + unsigned baseAddr = streamItemOffset * 4 * 64; // Patch GS-VS ring buffer descriptor base address for GS output Value *gsVsOutRingBufDescElem0 = builder.CreateExtractElement(desc, (uint64_t)0); @@ -295,7 +295,7 @@ Value *ShaderSystemValues::getGsVsRingBufDesc(unsigned streamId) { // Calculate and set stride in SRD dword1 unsigned gsVsStride = m_pipelineState->getShaderModes()->getGeometryShaderMode().outputVertices * - resUsage->inOutUsage.gs.outLocCount[streamId] * sizeof(unsigned) * 4; + resUsage->inOutUsage.gs.calcFactor.gsVsVertexItemSize[streamId] * 4; SqBufRsrcWord1 strideSetValue = {}; strideSetValue.bits.stride = gsVsStride; diff --git a/lgc/patch/WorkaroundDsSubdwordWrite.cpp b/lgc/patch/WorkaroundDsSubdwordWrite.cpp new file mode 100644 index 0000000000..08054463e3 --- /dev/null +++ b/lgc/patch/WorkaroundDsSubdwordWrite.cpp @@ -0,0 +1,101 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file WorkaroundDsSubdwordWrite.cpp + * @brief LLPC source file: contains implementation of class lgc::WorkaroundDsSubdwordWrite. + *********************************************************************************************************************** + */ + +#include "lgc/patch/WorkaroundDsSubdwordWrite.h" +#include "lgc/builder/BuilderImpl.h" +#include "lgc/state/PipelineShaders.h" +#include "lgc/state/PipelineState.h" +#include "lgc/state/TargetInfo.h" +#include "lgc/util/BuilderBase.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "lgc-workaround-ds-subdword-write" + +using namespace lgc; +using namespace llvm; + +static cl::opt WorkaroundSubdwordWrite("workaround-subdword-write", + cl::desc("Waterfall loop around ds_write of subdword size"), + cl::init(false)); + +namespace lgc { + +// ===================================================================================================================== +// Executes the WorkaroundDsSubdwordWrite LLVM pass on the specified LLVM function. 
+// +// @param [in/out] module : LLVM module to be run on +// @param [in/out] analysisManager : Analysis manager to use for this transformation +// @returns : The preserved analyses (The analyses that are still valid after this pass) +PreservedAnalyses WorkaroundDsSubdwordWrite::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass WorkaroundDsSubdwordWrite\n"); + PipelineState *pipelineState = analysisManager.getResult(module).getPipelineState(); + bool workaroundSubdwordWrite = 0; + if (WorkaroundSubdwordWrite.getNumOccurrences()) + workaroundSubdwordWrite = WorkaroundSubdwordWrite.getValue(); + auto gfxIp = pipelineState->getTargetInfo().getGfxIpVersion(); + if (!workaroundSubdwordWrite || gfxIp.major != 11 || gfxIp.minor != 5) + return PreservedAnalyses::all(); +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 463892 + // Old version of the code + return PreservedAnalyses::all(); +#else + // New version of the code (also handles unknown version, which we treat as + // latest) + bool isChanged = false; + for (Function &func : module.getFunctionList()) { + for (BasicBlock &block : func) { + for (Instruction &inst : block) { + StoreInst *SI = dyn_cast(&inst); + if (!SI) + continue; + if (SI->getPointerAddressSpace() != ADDR_SPACE_LOCAL) + continue; + if (SI->getValueOperand()->getType()->getScalarSizeInBits() >= 32) + continue; + LLVM_DEBUG(dbgs() << "Inserting waterfall loop workaround for sub-dword store to DS memory:\n"); + LLVM_DEBUG(dbgs() << SI); + LLVM_DEBUG(dbgs() << "\n"); + BuilderImpl builderImpl(pipelineState); + builderImpl.createWaterfallLoop(SI, /*ptr must be uniform*/ 1, false, /*useVgprForOperands*/ true, ""); + isChanged = true; + } + } + } + return isChanged ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); +#endif +} +} // namespace lgc diff --git a/lgc/state/PipelineState.cpp b/lgc/state/PipelineState.cpp index 770167b544..b7a85906ee 100644 --- a/lgc/state/PipelineState.cpp +++ b/lgc/state/PipelineState.cpp @@ -1572,7 +1572,7 @@ void PipelineState::setShaderDefaultWaveSize(ShaderStageEnum stage) { // Per programming guide, it's recommended to use wave64 for fragment shader. waveSize = 64; } else if (hasShaderStage(ShaderStage::Geometry)) { - // Legacy (non-NGG) hardware path for GS does not support wave32. + // Legacy (non-NGG) HW path for GS does not support wave32 mode. waveSize = 64; if (getTargetInfo().getGfxIpVersion().major >= 11) waveSize = 32; @@ -1588,9 +1588,15 @@ void PipelineState::setShaderDefaultWaveSize(ShaderStageEnum stage) { waveSize = 64; unsigned waveSizeOption = getShaderOptions(checkingStage).waveSize; - if (waveSizeOption != 0) + if (waveSizeOption != 0) { waveSize = waveSizeOption; + if (checkingStage == ShaderStage::Geometry && getTargetInfo().getGfxIpVersion().major == 10) { + // Legacy (non-NGG) HW path for GS does not support wave32 mode. Ignore the settings. + waveSize = 64; + } + } + // Note: the conditions below override the tuning option. // If workgroup size is not larger than 32, use wave size 32. 
if (checkingStage == ShaderStage::Mesh || checkingStage == ShaderStage::Task || @@ -2112,7 +2118,6 @@ void PipelineState::setXfbStateMetadata(Module *module) { xfbStrides[xfbBuffer] = cast(metaOp->getValue())->getZExtValue(); m_xfbStateMetadata.enableXfb = true; } - m_xfbStateMetadata.enablePrimStats = !m_xfbStateMetadata.enableXfb; } } } diff --git a/lgc/state/TargetInfo.cpp b/lgc/state/TargetInfo.cpp index 56b46f36ff..73f0a1a59e 100644 --- a/lgc/state/TargetInfo.cpp +++ b/lgc/state/TargetInfo.cpp @@ -325,6 +325,30 @@ static void setGfx1103Info(TargetInfo *targetInfo) { } #endif +#if LLPC_BUILD_STRIX1 +// gfx1150 +// +// @param [in/out] targetInfo : Target info +static void setGfx1150Info(TargetInfo *targetInfo) { + setGfx11Info(targetInfo); + + targetInfo->getGpuWorkarounds().gfx11.waAtmPrecedesPos = 1; + + targetInfo->getGpuProperty().numShaderEngines = 1; +} + +// gfx115F +// +// @param [in/out] targetInfo : Target info +static void setGfx115FInfo(TargetInfo *targetInfo) { + setGfx11Info(targetInfo); + + targetInfo->getGpuWorkarounds().gfx11.waAtmPrecedesPos = 1; + + targetInfo->getGpuProperty().numShaderEngines = 1; +} +#endif + // ===================================================================================================================== // Set TargetInfo. Returns false if the GPU name is not found or not supported. 
// @@ -358,6 +382,10 @@ bool TargetInfo::setTargetInfo(StringRef gpuName) { {"gfx1102", &setGfx1102Info}, // gfx1102, navi33 #if LLPC_BUILD_PHOENIX1 || LLPC_BUILD_PHOENIX2 {"gfx1103", &setGfx1103Info}, // gfx1103, phoenix1 +#endif +#if LLPC_BUILD_STRIX1 + {"gfx1150", &setGfx1150Info}, // gfx1150, strix + {"gfx115F", &setGfx115FInfo}, // gfx115F, strix A0 #endif }; diff --git a/lgc/test/ImageSampleNoReturn.lgc b/lgc/test/ImageSampleNoReturn.lgc new file mode 100644 index 0000000000..06cfae3042 --- /dev/null +++ b/lgc/test/ImageSampleNoReturn.lgc @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc +; RUN: lgc -o - --mcpu=gfx1100 --emit-llvm %s | FileCheck -check-prefixes=CHECK %s + +define dllexport void @lgc.shader.CS() #0 !lgc.shaderstage !0 { +entry: + %rsrc = call <8 x i32> addrspace(4)* (...) @lgc.create.get.desc.ptr.p4v8i32(i32 1, i32 1, i32 0, i32 1) + %sampler = call <4 x i32> addrspace(4)* (...) @lgc.create.get.desc.ptr.p4v4i32(i32 2, i32 2, i32 0, i32 2) + %addr.tmp = insertelement <2 x float> undef, float 0.000000e+00, i64 0 + %addr = insertelement <2 x float> %addr.tmp, float 0.000000e+00, i64 1 + call void @lgc.create.image.sample(i32 1, i32 0, <8 x i32> addrspace(4)* %rsrc, <4 x i32> addrspace(4)* %sampler, i32 1, <2 x float> %addr) + ret void +} + +; Function Attrs: nounwind readnone +declare <8 x i32> addrspace(4)* @lgc.create.get.desc.ptr.p4v8i32(...) #1 + +; Function Attrs: nounwind readnone +declare <4 x i32> addrspace(4)* @lgc.create.get.desc.ptr.p4v4i32(...) #1 + +; Function Attrs: nounwind +declare void @lgc.create.image.sample(...) 
#0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!lgc.user.data.nodes = !{!1, !2, !3} +!lgc.color.export.formats = !{!4} + +!0 = !{i32 7} +!1 = !{!"DescriptorTableVaPtr", i32 0, i32 0, i32 11, i32 1, i32 1} +!2 = !{!"DescriptorResource", i32 1, i32 0, i32 0, i32 80, i32 0, i32 1, i32 8} +!3 = !{!"DescriptorSampler", i32 2, i32 0, i32 -1, i32 4, i32 0, i32 2, i32 4, i32 12288, i32 117436416, i32 1750073344, i32 -2147483648} +!4 = !{i32 10, i32 0, i32 0, i32 0, i32 15} +; CHECK-LABEL: @_amdgpu_cs_main( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], -4294967296 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[USERDATA11:%.*]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(4) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) [[TMP4]], i32 4), "dereferenceable"(ptr addrspace(4) [[TMP4]], i32 -1) ] +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP4]], align 4, !invariant.load !3 +; CHECK-NEXT: call void @llvm.amdgcn.image.sample.2d.nortn.f16(i32 15, half 0xH0000, half 0xH0000, <8 x i32> [[TMP5]], <4 x i32> , i1 false, i32 0, i32 0) +; CHECK-NEXT: ret void +; diff --git a/lgc/test/Transforms/Continufy/simple.lgc b/lgc/test/Transforms/Continufy/simple.lgc index 9149ce9784..d03d75ebd0 100644 --- a/lgc/test/Transforms/Continufy/simple.lgc +++ b/lgc/test/Transforms/Continufy/simple.lgc @@ -14,7 +14,8 @@ define spir_func void @raygen() !lgc.shaderstage !{i32 7} !continufy.stage !{i32 ; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 3 ; CHECK-NEXT: [[TMP3:%.*]] = call [2 x i32] (...) 
@lgc.cps.await__a2i32(i32 [[TMP2]], i32 8, i32 poison, i32 [[X]], ptr addrspace(1) [[DST]]) ; CHECK-NEXT: store [2 x i32] [[TMP3]], ptr addrspace(1) [[DST]], align 4 -; CHECK-NEXT: ret void +; CHECK-NEXT: call void @lgc.cps.complete() +; CHECK-NEXT: unreachable ; %pushconst = call ptr addrspace(4) @lgc.user.data(i32 0) %fn = load ptr, ptr addrspace(4) %pushconst @@ -60,7 +61,8 @@ define dllexport void @lgc.shader.CS.main() !lgc.shaderstage !{i32 7} { ; CHECK-NEXT: call void (...) @lgc.cps.await__isVoid(i32 [[TMP1]], i32 2, i32 poison) ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: ret void +; CHECK-NEXT: call void @lgc.cps.complete() +; CHECK-NEXT: unreachable ; entry: %id = call i32 @lgc.shader.input.LocalInvocationId(i32 49) diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/bf16muladd.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/bf16muladd.lgc new file mode 100644 index 0000000000..97b624e86e --- /dev/null +++ b/lgc/test/Transforms/LowerCooperativeMatrix/bf16muladd.lgc @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 5 +; RUN: lgc -o - --mcpu=gfx1100 -passes=lgc-lower-cooperative-matrix %s | FileCheck --check-prefixes=CHECK %s + +define <8 x i32> @muladd_bf16_bf16(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c) { +; CHECK-LABEL: define <8 x i32> @muladd_bf16_bf16( +; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[A]] to <16 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[B]] to <16 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[C]] to <16 x i16> +; CHECK-NEXT: [[VALUE1:%.*]] = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <16 x i16> [[TMP3]], i1 false) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[VALUE1]] to <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP4]] +; + %value = call <8 x i32> (...) 
@lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, i1 false, i1 false, i1 false, i1 false, i32 7, i32 7) + ret <8 x i32> %value +} + +define <8 x float> @muladd_bf16_f32(<8 x i32> %a, <8 x i32> %b, <8 x float> %c) { +; CHECK-LABEL: define <8 x float> @muladd_bf16_f32( +; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[A]] to <16 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[B]] to <16 x i16> +; CHECK-NEXT: [[VALUE1:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <8 x float> [[C]]) +; CHECK-NEXT: ret <8 x float> [[VALUE1]] +; + %value = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x i32> %a, <8 x i32> %b, <8 x float> %c, i1 false, i1 false, i1 false, i1 false, i32 2, i32 7) + ret <8 x float> %value +} + +declare <8 x i32> @lgc.cooperative.matrix.muladd__v8i32(...) +declare <8 x float> @lgc.cooperative.matrix.muladd__v8f32(...) 
diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/convert.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/convert.lgc index 146fa69acf..943299a9bc 100644 --- a/lgc/test/Transforms/LowerCooperativeMatrix/convert.lgc +++ b/lgc/test/Transforms/LowerCooperativeMatrix/convert.lgc @@ -70,10 +70,27 @@ define <8 x float> @convert_f16_to_factor(<8 x float> %accum) { ret <8 x float> %fact } +define <8 x i32> @convert_f16_to_bf16(<8 x float> %acc) { +; CHECK-LABEL: @convert_f16_to_bf16( +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x float> [[ACC:%.*]] to <16 x half> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x half> [[TMP2]], <16 x half> poison, <8 x i32> +; CHECK-NEXT: [[CONVERT16TOFLOAT32:%.*]] = fpext <8 x half> [[TMP3]] to <8 x float> +; CHECK-NEXT: [[TMP4:%.*]] = fptrunc <8 x float> [[CONVERT16TOFLOAT32]] to <8 x bfloat> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[TMP4]] to <8 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i16> [[TMP6]] to <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP7]] +; + %fConvert = call <8 x i32> (...) @lgc.cooperative.matrix.convert__v8i32(i32 45, <8 x float> %acc, i32 1, i32 7, i32 1, i32 1) + ret <8 x i32> %fConvert +} + declare i1 @getcc() declare <8 x float> @process1(<8 x float>) declare <8 x float> @lgc.cooperative.matrix.load__v8f32(...) declare <8 x float> @lgc.cooperative.matrix.transpose__v8f32(...) declare <8 x float> @lgc.cooperative.matrix.convert__v8f32(...) +declare <8 x i32> @lgc.cooperative.matrix.convert__v8i32(...) declare void @lgc.cooperative.matrix.store(...) 
diff --git a/lgc/test/Transforms/LowerDebugPrintf/basic.lgc b/lgc/test/Transforms/LowerDebugPrintf/basic.lgc index 312478af57..817cb273b0 100644 --- a/lgc/test/Transforms/LowerDebugPrintf/basic.lgc +++ b/lgc/test/Transforms/LowerDebugPrintf/basic.lgc @@ -23,6 +23,7 @@ define spir_func void @simple() !lgc.shaderstage !0 { ; PALMD-NEXT: .argument_count: 1 ; PALMD-NEXT: .index: ; PALMD-NEXT: .string: 'Test: %u' +; PALMD-NEXT: .user_data_offset: 0 ; PALMD-NEXT: .version: 1 declare void @lgc.debug.printf(...) diff --git a/lgc/test/Transforms/LowerGpuRt/init-static-id-op.lgc b/lgc/test/Transforms/LowerGpuRt/init-static-id-op.lgc new file mode 100644 index 0000000000..eeb886c9c0 --- /dev/null +++ b/lgc/test/Transforms/LowerGpuRt/init-static-id-op.lgc @@ -0,0 +1,20 @@ +; RUN: lgc -mcpu=gfx1030 -o - -passes=lgc-lower-gpurt %s | FileCheck --check-prefixes=CHECK %s + +; Test that each call site of @lgc.gpurt.init.static.id generates a unique ID. + +; ModuleID = 'lgcPipeline' +declare i32 @lgc.gpurt.init.static.id() +declare void @dummy.use(i32 %val) + +define spir_func void @test() { +; CHECK: call void @dummy.use(i32 [[#%d,ID1:]]) +; CHECK-NEXT: call void @dummy.use(i32 +; CHECK-NOT: [[#ID1]]) +; CHECK-NEXT: ret void +; + %1 = call i32 @lgc.gpurt.init.static.id() + call void @dummy.use(i32 %1) + %2 = call i32 @lgc.gpurt.init.static.id() + call void @dummy.use(i32 %2) + ret void +} diff --git a/lgc/test/WorkgroupIdOpt.lgc b/lgc/test/WorkgroupIdOpt.lgc new file mode 100644 index 0000000000..43d0088202 --- /dev/null +++ b/lgc/test/WorkgroupIdOpt.lgc @@ -0,0 +1,187 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 5 +; RUN: lgc -mcpu=gfx1100 -passes=lgc-patch-entry-point-mutate -o - %s | FileCheck --check-prefixes=CHECK %s +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !spirv.ExecutionModel !3 !lgc.shaderstage !4 { +; CHECK-LABEL: define dllexport amdgpu_cs void @lgc.shader.CS.main( +; CHECK-SAME: i32 inreg noundef [[GLOBALTABLE:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[DUMMYINIT2:%.*]], i32 inreg noundef [[DUMMYINIT3:%.*]], i32 inreg noundef [[DUMMYINIT4:%.*]], i32 inreg noundef [[DUMMYINIT5:%.*]], i32 inreg noundef [[DUMMYINIT6:%.*]], i32 inreg noundef [[DUMMYINIT7:%.*]], i32 inreg noundef [[DUMMYINIT8:%.*]], i32 inreg noundef [[DUMMYINIT9:%.*]], i32 inreg noundef [[DUMMYINIT10:%.*]], i32 inreg noundef [[DUMMYINIT11:%.*]], i32 inreg noundef [[DUMMYINIT12:%.*]], i32 inreg noundef [[DUMMYINIT13:%.*]], i32 inreg noundef [[DUMMYINIT14:%.*]], i32 inreg noundef [[DUMMYINIT15:%.*]], <2 x i32> inreg noundef [[WORKGROUPID2:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META5:![0-9]+]] !lgc.shaderstage [[META6:![0-9]+]] { +; CHECK-NEXT: [[_ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[USERDATA0]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(4) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) [[TMP4]], i32 4), "dereferenceable"(ptr addrspace(4) [[TMP4]], i32 -1) ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = call ptr 
addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(4) [[TMP5]], i1 false, i1 false) +; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[LOCALINVOCATIONID]], 1023 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <3 x i32> poison, i32 [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[LOCALINVOCATIONID]], 10 +; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 1023 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <3 x i32> [[TMP8]], i32 [[TMP10]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP9]], 10 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <3 x i32> [[TMP11]], i32 [[TMP12]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <3 x i32> [[TMP13]], i32 0, i64 2 +; CHECK-NEXT: [[TMP15:%.*]] = call <3 x i32> @lgc.reconfigure.local.invocation.id(<3 x i32> [[TMP14]], i32 0) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x i32> [[TMP15]], i64 2 +; CHECK-NEXT: [[TMP17:%.*]] = mul i32 3, [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <3 x i32> [[TMP15]], i64 1 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP17]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = mul i32 2, [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i32> [[TMP15]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 15 +; CHECK-NEXT: [[TMP24:%.*]] = lshr i32 [[TMP22]], 4 +; CHECK-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP22]], 16 +; CHECK-NEXT: br i1 [[TMP25]], label %[[IF0_THEN:.*]], label %[[IF0_ELSE:.*]] +; CHECK: [[IF0_THEN]]: +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i32> [[WORKGROUPID2]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i32> [[WORKGROUPID2]], i64 1 +; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr <{ [32 x i32] }>, ptr addrspace(7) [[TMP6]], i32 0, i32 0, i32 [[TMP22]] +; CHECK-NEXT: store i32 [[TMP28]], ptr addrspace(7) [[TMP29]], align 4 +; CHECK-NEXT: br label %[[IF0_END:.*]] +; CHECK: [[IF0_ELSE]]: +; CHECK-NEXT: 
[[TMP30:%.*]] = extractelement <2 x i32> [[WORKGROUPID2]], i64 0 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[WORKGROUPID2]], i64 1 +; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr <{ [32 x i32] }>, ptr addrspace(7) [[TMP6]], i32 0, i32 0, i32 [[TMP22]] +; CHECK-NEXT: store i32 [[TMP32]], ptr addrspace(7) [[TMP33]], align 4 +; CHECK-NEXT: br label %[[IF0_END]] +; CHECK: [[IF0_END]]: +; CHECK-NEXT: [[WORKGROUPID1:%.*]] = extractelement <2 x i32> [[WORKGROUPID2]], i64 0 +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[WORKGROUPID2]], i64 1 +; CHECK-NEXT: br i1 [[TMP25]], label %[[IF1_THEN:.*]], label %[[IF1_ELSE:.*]] +; CHECK: [[IF1_THEN]]: +; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP22]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr <{ [32 x i32] }>, ptr addrspace(7) [[TMP6]], i32 0, i32 0, i32 [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[WORKGROUPID1]], 5 +; CHECK-NEXT: store i32 [[TMP36]], ptr addrspace(7) [[TMP35]], align 4 +; CHECK-NEXT: br label %[[IF1_END:.*]] +; CHECK: [[IF1_ELSE]]: +; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP22]], 2 +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr <{ [32 x i32] }>, ptr addrspace(7) [[TMP6]], i32 0, i32 0, i32 [[TMP37]] +; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[WORKGROUPID1]], 10 +; CHECK-NEXT: store i32 [[TMP39]], ptr addrspace(7) [[TMP38]], align 4 +; CHECK-NEXT: br label %[[IF1_END]] +; CHECK: [[IF1_END]]: +; CHECK-NEXT: ret void +; +.entry: + %0 = call i64 @llvm.amdgcn.s.getpc() + %1 = bitcast i64 %0 to <2 x i32> + %2 = call i32 @lgc.load.user.data__i32(i32 0) + %3 = insertelement <2 x i32> %1, i32 %2, i64 0 + %4 = bitcast <2 x i32> %3 to i64 + %5 = inttoptr i64 %4 to ptr addrspace(4) + call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) %5, i32 4), "dereferenceable"(ptr addrspace(4) %5, i32 -1) ] + %6 = getelementptr i8, ptr addrspace(4) %5, i32 0 + %7 = call ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(4) %6, i1 false, 
i1 false) + %8 = call i32 @lgc.shader.input.LocalInvocationId(i32 50) #2 + %9 = and i32 %8, 1023 + %10 = insertelement <3 x i32> poison, i32 %9, i64 0 + %11 = lshr i32 %8, 10 + %12 = and i32 %11, 1023 + %13 = insertelement <3 x i32> %10, i32 %12, i64 1 + %14 = lshr i32 %11, 10 + %15 = insertelement <3 x i32> %13, i32 %14, i64 2 + %16 = insertelement <3 x i32> %15, i32 0, i64 2 + %17 = call <3 x i32> @lgc.reconfigure.local.invocation.id(<3 x i32> %16, i32 0) #2 + %18 = extractelement <3 x i32> %17, i64 2 + %19 = mul i32 3, %18 + %20 = extractelement <3 x i32> %17, i64 1 + %21 = add i32 %19, %20 + %22 = mul i32 2, %21 + %23 = extractelement <3 x i32> %17, i64 0 + %24 = add i32 %22, %23 + %25 = and i32 %24, 15 + %26 = lshr i32 %24, 4 + %27 = icmp ult i32 %24, 16 + br i1 %27, label %if0.then, label %if0.else + +if0.then: ; preds = %.entry + %28 = call <3 x i32> @lgc.shader.input.WorkgroupId(i32 0) #2 + %29 = extractelement <3 x i32> %28, i64 0 + %30 = extractelement <3 x i32> %28, i64 1 + %31 = add i32 %29, %30 + %32 = getelementptr <{ [32 x i32] }>, ptr addrspace(7) %7, i32 0, i32 0, i32 %24 + store i32 %31, ptr addrspace(7) %32, align 4 + br label %if0.end + +if0.else: ; preds = %.entry + %33 = call <3 x i32> @lgc.shader.input.WorkgroupId(i32 0) #2 + %34 = extractelement <3 x i32> %33, i64 0 + %35 = extractelement <3 x i32> %33, i64 1 + %36 = add i32 %34, %35 + %37 = getelementptr <{ [32 x i32] }>, ptr addrspace(7) %7, i32 0, i32 0, i32 %24 + store i32 %36, ptr addrspace(7) %37, align 4 + br label %if0.end + +if0.end: ; preds = %if0.else, %if0.then + %38 = call <3 x i32> @lgc.shader.input.WorkgroupId(i32 0) #2 + br i1 %27, label %if1.then, label %if1.else + +if1.then: ; preds = %if0.end + %40 = extractelement <3 x i32> %38, i64 0 + %41 = add i32 %24, 1 + %42 = getelementptr <{ [32 x i32] }>, ptr addrspace(7) %7, i32 0, i32 0, i32 %41 + %43 = add i32 %40, 5 + store i32 %43, ptr addrspace(7) %42, align 4 + br label %if1.end + +if1.else: ; preds = %if0.end + %44 = 
extractelement <3 x i32> %38, i64 0 + %45 = add i32 %24, 2 + %46 = getelementptr <{ [32 x i32] }>, ptr addrspace(7) %7, i32 0, i32 0, i32 %45 + %47 = add i32 %44, 10 + store i32 %47, ptr addrspace(7) %46, align 4 + br label %if1.end + +if1.end: ; preds = %if1.else, %if1.then + ret void +} + +; Function Attrs: nounwind willreturn memory(none) +declare ptr addrspace(7) @lgc.load.buffer.desc(i64, i32, i32, i32) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.shader.input.LocalInvocationId(i32) #2 + +; Function Attrs: nounwind memory(none) +declare <3 x i32> @lgc.reconfigure.local.invocation.id(<3 x i32>, i32) #2 + +; Function Attrs: nounwind memory(none) +declare <3 x i32> @lgc.shader.input.WorkgroupId(i32) #2 + +; Function Attrs: nounwind willreturn memory(none) +declare i32 @lgc.load.user.data__i32(i32) #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i64 @llvm.amdgcn.s.getpc() #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) +declare void @llvm.assume(i1 noundef) #4 + +; Function Attrs: nounwind willreturn memory(none) +declare ptr addrspace(7) @lgc.buffer.load.desc.to.ptr(ptr addrspace(4), i1, i1) #1 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize32" } +attributes #1 = { nounwind willreturn memory(none) } +attributes #2 = { nounwind memory(none) } +attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } + +!llpc.compute.mode = !{!0} +!lgc.user.data.nodes = !{!1, !2} + +!0 = !{i32 2, i32 3, i32 1} +!1 = !{!"DescriptorTableVaPtr", i32 7, i32 128, i32 0, i32 1, i32 1} +!2 = !{!"DescriptorBuffer", i32 6, i32 128, i32 0, i32 4, i64 0, i32 0, i32 4} +!3 = !{i32 5} +!4 = !{i32 7} +;. 
+; CHECK: [[META5]] = !{i32 5} +; CHECK: [[META6]] = !{i32 7} +;. diff --git a/lgc/test/shaderdb/gfx1150_ds_subdword_workaround.lgc b/lgc/test/shaderdb/gfx1150_ds_subdword_workaround.lgc new file mode 100644 index 0000000000..fc77a85fdb --- /dev/null +++ b/lgc/test/shaderdb/gfx1150_ds_subdword_workaround.lgc @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc +; RUN: lgc -o - --mcpu=gfx1100 -filetype=asm %s | FileCheck -check-prefixes=CHECK %s +; RUN: lgc -o - --mcpu=gfx1150 -filetype=asm %s | FileCheck -check-prefixes=CHECK %s +; RUN: lgc -o - -workaround-subdword-write --mcpu=gfx1150 -filetype=asm %s | FileCheck -check-prefixes=WAR %s + +define dllexport spir_func void @store32(i32 %value, i32 %index) !lgc.shaderstage !0 { +.entry: + %gep = getelementptr [16384 x i32], ptr addrspace(3) @Lds, i32 0, i32 %index + store i32 %value, ptr addrspace(3) %gep, align 4 + ret void +} + +define dllexport spir_func void @store16(i16 %value, i32 %index) !lgc.shaderstage !0 { +.entry: + %gep = getelementptr [16384 x i32], ptr addrspace(3) @Lds, i32 0, i32 %index + store i16 %value, ptr addrspace(3) %gep, align 2 + ret void +} + +define dllexport spir_func void @store8(i8 %value, i32 %index) !lgc.shaderstage !0 { +.entry: + %gep = getelementptr [16384 x i32], ptr addrspace(3) @Lds, i32 0, i32 %index + store i8 %value, ptr addrspace(3) %gep, align 1 + ret void +} + +@Lds = external addrspace(3) global [16384 x i32], align 4 + +attributes #0 = { nounwind } + +; ShaderStage::Compute +!0 = !{i32 7} + +; Setting Threadgroup Dimensions to 64 x 1 x 1 +!llpc.compute.mode = !{!1} +!1 = !{i32 64, i32 1, i32 1} +; REQUIRES: do-not-run-me + +; CHECK-LABEL: amdgpu_cs_main: +; CHECK: v_lshlrev_b32_e32 v1, 2, v1 +; CHECK-NEXT: ds_store_b32 v1, v0 +; CHECK-NEXT: s_endpgm +; +; CHECK-LABEL: amdgpu_cs_main.1: +; CHECK: v_lshlrev_b32_e32 v1, 2, v1 +; CHECK-NEXT: ds_store_b16 v1, v0 +; CHECK-NEXT: s_endpgm +; +; CHECK-LABEL: 
amdgpu_cs_main.2: +; CHECK: v_lshlrev_b32_e32 v1, 2, v1 +; CHECK-NEXT: ds_store_b8 v1, v0 +; CHECK-NEXT: s_endpgm +; +; WAR-LABEL: amdgpu_cs_main: +; WAR: v_lshlrev_b32_e32 v1, 2, v1 +; WAR-NEXT: ds_store_b32 v1, v0 +; WAR-NEXT: s_endpgm +; +; WAR-LABEL: amdgpu_cs_main.1: +; WAR: v_lshlrev_b32_e32 v1, 2, v1 +; WAR-NEXT: s_mov_b64 s[0:1], exec +; WAR-NEXT: .LBB1_1: +; WAR-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; WAR-NEXT: v_readfirstlane_b32 s0, v1 +; WAR-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, v1 +; WAR-NEXT: s_delay_alu instid0(VALU_DEP_1) +; WAR-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; WAR-NEXT: ds_store_b16 v1, v0 +; WAR-NEXT: s_xor_b64 exec, exec, s[0:1] +; WAR-NEXT: s_cbranch_execnz .LBB1_1 +; WAR-NEXT: s_endpgm +; +; WAR-LABEL: amdgpu_cs_main.2: +; WAR: v_lshlrev_b32_e32 v1, 2, v1 +; WAR-NEXT: s_mov_b64 s[0:1], exec +; WAR-NEXT: .LBB2_1: +; WAR-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; WAR-NEXT: v_readfirstlane_b32 s0, v1 +; WAR-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, v1 +; WAR-NEXT: s_delay_alu instid0(VALU_DEP_1) +; WAR-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; WAR-NEXT: ds_store_b8 v1, v0 +; WAR-NEXT: s_xor_b64 exec, exec, s[0:1] +; WAR-NEXT: s_cbranch_execnz .LBB2_1 +; WAR-NEXT: s_endpgm diff --git a/lgc/tool/lgc/lgc.cpp b/lgc/tool/lgc/lgc.cpp index 357bd39224..e42ec5447a 100644 --- a/lgc/tool/lgc/lgc.cpp +++ b/lgc/tool/lgc/lgc.cpp @@ -33,6 +33,7 @@ #include "lgc/LgcContext.h" #include "lgc/LgcCpsDialect.h" #include "lgc/LgcDialect.h" +#include "lgc/LgcIlCpsDialect.h" #include "lgc/PassManager.h" #include "lgc/Pipeline.h" #include "lgc/patch/Patch.h" @@ -199,7 +200,8 @@ int main(int argc, char **argv) { LgcContext::initialize(); LLVMContext context; - auto dialectContext = llvm_dialects::DialectContext::make(context); + auto dialectContext = + llvm_dialects::DialectContext::make(context); // Set our category on options that we want to show in -help, and hide other options. 
auto opts = cl::getRegisteredOptions(); diff --git a/lgc/util/WorkgroupLayout.cpp b/lgc/util/WorkgroupLayout.cpp new file mode 100644 index 0000000000..dbbc7c725a --- /dev/null +++ b/lgc/util/WorkgroupLayout.cpp @@ -0,0 +1,211 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** +*********************************************************************************************************************** +* @file WorkgroupLayout.cpp +* @brief LLPC source file: Implementation of swizzle workgroup layout +*********************************************************************************************************************** +*/ +#include "lgc/util/WorkgroupLayout.h" + +using namespace lgc; +using namespace llvm; + +// ===================================================================================================================== +// Compute the swizzle workgroup layout (macro and micro level) of a compute, task or mesh shader, to allow +// reconfigWorkgroupLayout to apply optimizations. Any other stage gets a layout with both levels Unknown. +// @param pipelineState : Pipeline state; shader modes, resource usage and pipeline options are read from it +// @param shaderStage : Shader stage +SwizzleWorkgroupLayout lgc::calculateWorkgroupLayout(PipelineState *pipelineState, ShaderStageEnum shaderStage) { + unsigned workgroupSizeX = 0; + unsigned workgroupSizeY = 0; + SwizzleWorkgroupLayout resultLayout = {WorkgroupLayout::Unknown, WorkgroupLayout::Unknown}; + DerivativeMode derivativeMode = DerivativeMode::Linear; + + if (shaderStage != ShaderStage::Compute && shaderStage != ShaderStage::Task && shaderStage != ShaderStage::Mesh) { + return resultLayout; + } + + if (shaderStage == ShaderStage::Compute) { + const ResourceUsage *resUsage = pipelineState->getShaderResourceUsage(ShaderStage::Compute); + if (resUsage->builtInUsage.cs.foldWorkgroupXY) { + llvm_unreachable("Should never be called!"); + } + auto &mode = pipelineState->getShaderModes()->getComputeShaderMode(); + workgroupSizeX = mode.workgroupSizeX; + workgroupSizeY = mode.workgroupSizeY; + derivativeMode = mode.derivativeMode; + } + + if (shaderStage == ShaderStage::Task) { + const ResourceUsage *resUsage = pipelineState->getShaderResourceUsage(ShaderStage::Task); + if (resUsage->builtInUsage.task.foldWorkgroupXY) { + llvm_unreachable("Should never be 
called!"); + } + auto &mode = pipelineState->getShaderModes()->getComputeShaderMode(); + workgroupSizeX = mode.workgroupSizeX; + workgroupSizeY = mode.workgroupSizeY; + derivativeMode = mode.derivativeMode; + } + + if (shaderStage == ShaderStage::Mesh) { + const ResourceUsage *resUsage = pipelineState->getShaderResourceUsage(ShaderStage::Mesh); + if (resUsage->builtInUsage.mesh.foldWorkgroupXY) { + llvm_unreachable("Should never be called!"); + } + auto &mode = pipelineState->getShaderModes()->getMeshShaderMode(); + workgroupSizeX = mode.workgroupSizeX; + workgroupSizeY = mode.workgroupSizeY; + derivativeMode = mode.derivativeMode; + } + + if (derivativeMode == DerivativeMode::Quads) { + resultLayout.microLayout = WorkgroupLayout::Quads; + } else if (derivativeMode == DerivativeMode::Linear) { + resultLayout.microLayout = WorkgroupLayout::Linear; + } + + if (pipelineState->getOptions().forceCsThreadIdSwizzling) { + if ((workgroupSizeX >= 16) && (workgroupSizeX % 8 == 0) && (workgroupSizeY % 4 == 0)) { + resultLayout.macroLayout = WorkgroupLayout::SexagintiQuads; + } + } + + // If a layout level is still Unknown, pick one when the reconfigWorkgroupLayout pipeline option is enabled and the + // workgroup X and Y sizes are both even. + if (pipelineState->getOptions().reconfigWorkgroupLayout) { + if ((workgroupSizeX % 2) == 0 && (workgroupSizeY % 2) == 0) { + if (workgroupSizeX % 8 == 0) { + // It can be reconfigured into 8 X N + if (resultLayout.macroLayout == WorkgroupLayout::Unknown) { + resultLayout.macroLayout = WorkgroupLayout::SexagintiQuads; + } + } else { + // X is not a multiple of 8 but X and Y are both even: use quads if no micro layout is set yet. + if (resultLayout.microLayout == WorkgroupLayout::Unknown) { + resultLayout.microLayout = WorkgroupLayout::Quads; + } + } + } + } + + return resultLayout; +} + +// ===================================================================================================================== +// Reconfigure the workgroup for optimization purposes. 
+// @param localInvocationId : This is a v3i32 shader input; element 0 is read as the folded XY id, element 1 as Z. +// @param pipelineState : Pipeline state; foldWorkgroupXY is set in the built-in usage of shaderStage +// @param shaderStage : Shader stage (Mesh, Task or Compute) +// @param macroLayout : Swizzle the thread id into macroLayout from macro level +// @param microLayout : Swizzle the thread id into microLayout from micro level +// @param workgroupSizeX : WorkgroupSize X for thread Id numbers +// @param workgroupSizeY : WorkgroupSize Y for thread Id numbers +// @param workgroupSizeZ : WorkgroupSize Z for thread Id numbers +// @param isHwLocalInvocationId : true if the id is wanted as BuiltInUnswizzledLocalInvocationId (XY is only unfolded, +// not swizzled); false applies the macroLayout/microLayout swizzle +// @param builder : the builder to use; the rebuilt (x, y, z) local invocation id vector is returned +Value *lgc::reconfigWorkgroupLayout(Value *localInvocationId, PipelineState *pipelineState, ShaderStageEnum shaderStage, + WorkgroupLayout macroLayout, WorkgroupLayout microLayout, unsigned workgroupSizeX, + unsigned workgroupSizeY, unsigned workgroupSizeZ, bool isHwLocalInvocationId, + BuilderBase &builder) { + Value *apiX = builder.getInt32(0); + Value *apiY = builder.getInt32(0); + Value *newLocalInvocationId = PoisonValue::get(localInvocationId->getType()); + unsigned bitsX = 0; + unsigned bitsY = 0; + ResourceUsage *resUsage = pipelineState->getShaderResourceUsage(shaderStage); + if (shaderStage == ShaderStage::Mesh) { + resUsage->builtInUsage.mesh.foldWorkgroupXY = true; + } else if (shaderStage == ShaderStage::Task) { + resUsage->builtInUsage.task.foldWorkgroupXY = true; + } else { + assert(shaderStage == ShaderStage::Compute); + resUsage->builtInUsage.cs.foldWorkgroupXY = true; + } + Value *tidXY = builder.CreateExtractElement(localInvocationId, builder.getInt32(0), "tidXY"); + Value *apiZ = builder.getInt32(0); + if (workgroupSizeZ > 1) { + apiZ = builder.CreateExtractElement(localInvocationId, builder.getInt32(1), "tidZ"); + } + // For BuiltInUnswizzledLocalInvocationId, do not swizzle: only unfold the XY id into x = tidXY % workgroupSizeX and + // y = tidXY / workgroupSizeX. 
+ if (isHwLocalInvocationId) { + apiX = builder.CreateURem(tidXY, builder.getInt32(workgroupSizeX)); + apiY = builder.CreateUDiv(tidXY, builder.getInt32(workgroupSizeX)); + } else { + // Micro-tiling with quad:2x2, the thread-id will be marked as {<0,0>,<1,0>,<0,1>,<1,1>} + // for each quad. Each 4 threads will be wrapped in the same tid. + if (microLayout == WorkgroupLayout::Quads) { + apiX = builder.CreateAnd(tidXY, builder.getInt32(1)); + apiY = builder.CreateAnd(builder.CreateLShr(tidXY, builder.getInt32(1)), builder.getInt32(1)); + tidXY = builder.CreateLShr(tidXY, builder.getInt32(2)); + bitsX = 1; + bitsY = 1; + } + + // Macro-tiling with 8xN block + if (macroLayout == WorkgroupLayout::SexagintiQuads) { + unsigned bits = 3 - bitsX; + Value *subTileApiX = builder.CreateAnd(tidXY, builder.getInt32((1 << bits) - 1)); + subTileApiX = builder.CreateShl(subTileApiX, builder.getInt32(bitsX)); + apiX = builder.CreateOr(apiX, subTileApiX); + + // 1. Folding 4 threads as one tid if micro-tiling with quad before. + // After the folding, each 4 hwThreadIdX share the same tid after tid>>=bits. + // For example: hwThreadId.X = 0~3, the tid will be 0; will be {<0,0>,<1,0>,<0,1>,<1,1>} + // hwThreadId.X = 4~7, the tid will be 1; will be {<0,0>,<1,0>,<0,1>,<1,1>} + // 2. Folding 8 threads as one tid without any micro-tiling before. + // After the folding, each 8 hwThreadIdX share the same tid after tid>>=bits and only apiX is calculated. + // For example: hwThreadId.X = 0~7, tid = hwThreadId.X/8 = 0; will be {0,1,...,7} + // hwThreadId.X = 8~15, tid = hwThreadId.X/8 = 1; will be {0,1,...,7} + tidXY = builder.CreateLShr(tidXY, builder.getInt32(bits)); + bitsX = 3; + + // 1. Unfolding 4 threads, it needs to set walkY = workgroupSizeY/2 as these threads are wrapped in 2X2 size. + // 2. Unfolding 8 threads, walkY = workgroupSizeY (bitsY is 0 without micro-tiling) as they are wrapped in 1x8 size. 
+ // After unfolding these threads, it needs '| apiX and | apiY' to calculate each thread's coordinate + // in the unfolded wrap threads. + unsigned walkY = workgroupSizeY >> bitsY; + Value *tileApiY = builder.CreateShl(builder.CreateURem(tidXY, builder.getInt32(walkY)), builder.getInt32(bitsY)); + apiY = builder.CreateOr(apiY, tileApiY); + Value *tileApiX = builder.CreateShl(builder.CreateUDiv(tidXY, builder.getInt32(walkY)), builder.getInt32(bitsX)); + apiX = builder.CreateOr(apiX, tileApiX); + } else { + // No macro-tiling: walk the remaining tid X-first over (workgroupSizeX >> bitsX) columns, then merge the result + // with the in-quad bits by '| apiX and | apiY' + unsigned walkX = workgroupSizeX >> bitsX; + Value *tileApiX = builder.CreateShl(builder.CreateURem(tidXY, builder.getInt32(walkX)), builder.getInt32(bitsX)); + apiX = builder.CreateOr(apiX, tileApiX); + Value *tileApiY = builder.CreateShl(builder.CreateUDiv(tidXY, builder.getInt32(walkX)), builder.getInt32(bitsY)); + apiY = builder.CreateOr(apiY, tileApiY); + } + } + + newLocalInvocationId = builder.CreateInsertElement(newLocalInvocationId, apiX, uint64_t(0)); + newLocalInvocationId = builder.CreateInsertElement(newLocalInvocationId, apiY, uint64_t(1)); + newLocalInvocationId = builder.CreateInsertElement(newLocalInvocationId, apiZ, uint64_t(2)); + return newLocalInvocationId; +} diff --git a/llpc/CMakeLists.txt b/llpc/CMakeLists.txt index 41330609a9..748b948e98 100644 --- a/llpc/CMakeLists.txt +++ b/llpc/CMakeLists.txt @@ -61,6 +61,8 @@ if(ICD_BUILD_LLPC) set(LLVM_INCLUDE_TOOLS ON CACHE BOOL Force) set(LLVM_INCLUDE_UTILS ON CACHE BOOL Force) set(LLVM_ENABLE_TERMINFO OFF CACHE BOOL Force) + set(LLVM_RAM_PER_TABLEGEN_JOB 10000 CACHE STRING Force) + set(LLVM_RAM_PER_LINK_JOB 5000 CACHE STRING Force) if (NOT WIN32) # Build optimized version of llvm-tblgen even in debug builds, for faster build times. 
#if _WIN32 @@ -215,22 +217,22 @@ if(ICD_BUILD_LLPC) # llpc/lower target_sources(llpcinternal PRIVATE lower/llpcSpirvLower.cpp - lower/llpcSpirvLowerAccessChain.cpp - lower/llpcSpirvLowerCfgMerges.cpp - lower/llpcSpirvLowerConstImmediateStore.cpp - lower/llpcSpirvLowerGlobal.cpp - lower/llpcSpirvLowerInstMetaRemove.cpp - lower/llpcSpirvLowerMath.cpp - lower/llpcSpirvLowerMemoryOp.cpp + lower/LowerAccessChain.cpp + lower/LowerCfgMerges.cpp + lower/LowerConstImmediateStore.cpp + lower/LowerGlobals.cpp + lower/LowerInstMetaRemove.cpp + lower/LowerMath.cpp + lower/LowerMemoryOp.cpp lower/LowerPostInline.cpp - lower/llpcSpirvLowerRayTracing.cpp - lower/llpcSpirvLowerTerminator.cpp - lower/llpcSpirvLowerTranslator.cpp + lower/LowerRayTracing.cpp + lower/LowerTerminator.cpp + lower/LowerTranslator.cpp lower/llpcSpirvLowerUtil.cpp - lower/llpcSpirvProcessGpuRtLibrary.cpp - lower/llpcSpirvLowerInternalLibraryIntrinsicUtil.cpp + lower/ProcessGpuRtLibrary.cpp + lower/LowerInternalLibraryIntrinsic.cpp lower/LowerGLCompatibility.cpp - lower/llpcSpirvLowerCooperativeMatrix.cpp + lower/LowerCooperativeMatrix.cpp lower/PrepareContinuations.cpp lower/LowerAdvancedBlend.cpp lower/ProcessGfxRuntimeLibrary.cpp diff --git a/llpc/context/llpcCompiler.cpp b/llpc/context/llpcCompiler.cpp index b4603a1c8b..433757dac3 100644 --- a/llpc/context/llpcCompiler.cpp +++ b/llpc/context/llpcCompiler.cpp @@ -31,6 +31,9 @@ #include "llpcCompiler.h" #include "LLVMSPIRVLib.h" #include "LowerAdvancedBlend.h" +#include "LowerCfgMerges.h" +#include "LowerRayTracing.h" +#include "LowerTranslator.h" #include "PrepareContinuations.h" #include "SPIRVEntry.h" #include "SPIRVFunction.h" @@ -47,9 +50,6 @@ #include "llpcRayTracingContext.h" #include "llpcShaderModuleHelper.h" #include "llpcSpirvLower.h" -#include "llpcSpirvLowerCfgMerges.h" -#include "llpcSpirvLowerRayTracing.h" -#include "llpcSpirvLowerTranslator.h" #include "llpcSpirvLowerUtil.h" #include "llpcThreading.h" #include "llpcTimerProfiler.h" @@ 
-1869,7 +1869,8 @@ Result Compiler::buildPipelineInternal(Context *context, ArrayRefaddPass(LowerAdvancedBlend(pipelineInfo->advancedBlendInfo.binding)); + lowerPassMgr->addPass( + LowerAdvancedBlend(pipelineInfo->advancedBlendInfo.binding, pipelineInfo->advancedBlendInfo.enableRov)); if (EnableOuts()) { lowerPassMgr->addPass(PrintModulePass( outs(), "\n" @@ -2694,6 +2695,7 @@ Result Compiler::BuildRayTracingPipeline(const RayTracingPipelineBuildInfo *pipe summary.knownUnsetRayFlags &= knownFlags.Zero.getZExtValue(); pipelineOut->hasTraceRay = summary.hasTraceRayModule; + pipelineOut->hasKernelEntry = summary.hasKernelEntry; std::string summaryMsgpack = summary.encodeMsgpack(); void *allocBuf = nullptr; @@ -3106,7 +3108,7 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext, // SPIR-V translation, then dump the result. lowerPassMgr->addPass(SpirvLowerTranslator(shaderInfoEntry->entryStage, shaderInfoEntry)); - lowerPassMgr->addPass(SpirvLowerCfgMerges()); + lowerPassMgr->addPass(LowerCfgMerges()); lowerPassMgr->addPass(AlwaysInlinerPass()); // Run the passes. @@ -3133,7 +3135,6 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext, const bool isContinuationsMode = rtContext.isContinuationsMode(); - // TODO: Do not build launch kernel for library. std::unique_ptr entry = std::move(modules.back()); modules.pop_back(); shaderInfo = shaderInfo.drop_back(); @@ -3326,10 +3327,24 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext, } // Build entry module at very last. 
- Result result = buildRayTracingPipelineElf(mainContext, std::move(entry), pipelineElfs[0], shaderProps, - moduleCallsTraceRay, 0, pipeline, timerProfiler); - if (result != Result::Success) - return result; +#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 75 + const bool needEntry = true; +#else + const bool needEntry = rtContext.getRayTracingPipelineBuildInfo()->libraryMode != LibraryMode::Library; +#endif + if (needEntry) { + Result result = buildRayTracingPipelineElf(mainContext, std::move(entry), pipelineElfs[0], shaderProps, + moduleCallsTraceRay, 0, pipeline, timerProfiler); + if (result != Result::Success) + return result; + + rtContext.getRayTracingLibrarySummary().hasKernelEntry = true; + + } else { + // Do not build launch kernel for library. + assert(indirectStageMask == ShaderStageAllRayTracingBit); + pipelineElfs.erase(pipelineElfs.begin()); + } return hasError ? Result::ErrorInvalidShader : Result::Success; } @@ -3594,15 +3609,15 @@ void Compiler::buildShaderCacheHash(Context *context, unsigned stageMask, ArrayR auto shaderHashCode = MetroHash::compact64(&hash); if (stage == ShaderStageFragment) { fragmentHasher.Update(shaderHashCode); + // NOTE: In the case of the same fragment shader and fragment state, if fragment use generic builtIn or + // barycentric, we still need to consider previous shader, because previous shader will affect the inputs of + // fragment. const ShaderModuleData *moduleData = reinterpret_cast(shaderInfo->pModuleData); - if (moduleData && moduleData->usage.useBarycentric) { - // If fragment uses barycentrics, we still need to care about the previous stage, because the primitive type - // might be specified there. 
- if ((preStage != ShaderStageInvalid) && (preStage != ShaderStageVertex)) { - auto preShaderInfo = pipelineContext->getPipelineShaderInfo(preStage); - moduleData = reinterpret_cast(preShaderInfo->pModuleData); - fragmentHasher.Update(moduleData->cacheHash); - } + if (moduleData && (moduleData->usage.useBarycentric || moduleData->usage.useGenericBuiltIn)) { + assert(preStage != ShaderStageInvalid); + auto preShaderInfo = pipelineContext->getPipelineShaderInfo(preStage); + moduleData = reinterpret_cast(preShaderInfo->pModuleData); + fragmentHasher.Update(moduleData->cacheHash); } } else nonFragmentHasher.Update(shaderHashCode); diff --git a/llpc/context/llpcContext.cpp b/llpc/context/llpcContext.cpp index 6793151f55..c1476c6411 100644 --- a/llpc/context/llpcContext.cpp +++ b/llpc/context/llpcContext.cpp @@ -29,28 +29,29 @@ *********************************************************************************************************************** */ #include "llpcContext.h" +#include "LowerAccessChain.h" #include "LowerAdvancedBlend.h" +#include "LowerCfgMerges.h" +#include "LowerGlobals.h" +#include "LowerTranslator.h" #include "ProcessGfxRuntimeLibrary.h" +#include "ProcessGpuRtLibrary.h" #include "SPIRVInternal.h" -#include "gfxruntime/GfxRuntimeLibrary.h" #include "llpcCompiler.h" #include "llpcDebug.h" #include "llpcPipelineContext.h" #include "llpcSpirvLower.h" -#include "llpcSpirvLowerAccessChain.h" -#include "llpcSpirvLowerCfgMerges.h" -#include "llpcSpirvLowerGlobal.h" -#include "llpcSpirvLowerTranslator.h" -#include "llpcSpirvProcessGpuRtLibrary.h" #include "llpcTimerProfiler.h" +#include "vkgcMetroHash.h" +#include "gfxruntime/GfxRuntimeLibrary.h" #include "llvmraytracing/ContinuationsDialect.h" #include "llvmraytracing/GpurtContext.h" -#include "vkgcMetroHash.h" #include "lgc/Builder.h" #include "lgc/GpurtDialect.h" #include "lgc/LgcContext.h" #include "lgc/LgcCpsDialect.h" #include "lgc/LgcDialect.h" +#include "lgc/LgcIlCpsDialect.h" #include 
"lgc/LgcRtDialect.h" #include "lgc/LgcRtqDialect.h" #include "lgc/PassManager.h" @@ -84,6 +85,7 @@ using namespace lgc::rt; using namespace lgc::rtq; using namespace llvm; using namespace lgc::cps; +using namespace lgc::ilcps; namespace Llpc { @@ -91,8 +93,9 @@ namespace Llpc { // // @param gfxIp : Graphics IP version info Context::Context(GfxIpVersion gfxIp) : LLVMContext(), m_gfxIp(gfxIp) { - m_dialectContext = llvm_dialects::DialectContext::make(*this); + m_dialectContext = + llvm_dialects::DialectContext::make(*this); reset(); } @@ -278,11 +281,11 @@ void Context::ensureGpurtLibrary() { "// LLPC SPIRV-to-LLVM translation results\n")); } - lowerPassMgr->addPass(SpirvLowerCfgMerges()); - lowerPassMgr->addPass(SpirvProcessGpuRtLibrary()); + lowerPassMgr->addPass(LowerCfgMerges()); + lowerPassMgr->addPass(ProcessGpuRtLibrary()); lowerPassMgr->addPass(AlwaysInlinerPass()); - lowerPassMgr->addPass(SpirvLowerAccessChain()); - lowerPassMgr->addPass(SpirvLowerGlobal()); + lowerPassMgr->addPass(LowerAccessChain()); + lowerPassMgr->addPass(LowerGlobals()); // Run some basic optimization to simplify the code. This should be more efficient than optimizing them after they are // inlined into the caller. 
@@ -340,11 +343,11 @@ void Context::ensureGfxRuntimeLibrary() { "// LLPC SPIRV-to-LLVM translation results\n")); } - lowerPassMgr->addPass(SpirvLowerCfgMerges()); + lowerPassMgr->addPass(LowerCfgMerges()); lowerPassMgr->addPass(ProcessGfxRuntimeLibrary()); lowerPassMgr->addPass(AlwaysInlinerPass()); - lowerPassMgr->addPass(SpirvLowerAccessChain()); - lowerPassMgr->addPass(SpirvLowerGlobal()); + lowerPassMgr->addPass(LowerAccessChain()); + lowerPassMgr->addPass(LowerGlobals()); timerProfiler.addTimerStartStopPass(*lowerPassMgr, TimerTranslate, false); lowerPassMgr->run(*gfxRuntime); diff --git a/llpc/context/llpcGraphicsContext.cpp b/llpc/context/llpcGraphicsContext.cpp index 453253d363..762981b296 100644 --- a/llpc/context/llpcGraphicsContext.cpp +++ b/llpc/context/llpcGraphicsContext.cpp @@ -255,6 +255,9 @@ Options GraphicsContext::computePipelineOptions() const { options.useSoftwareVertexBufferDescriptors = pipelineInfo->useSoftwareVertexBufferDescriptors; options.vbAddressLowBitsKnown = pipelineInfo->getGlState().vbAddressLowBitsKnown; options.dynamicTopology = pipelineInfo->dynamicTopology; + options.enableMapClipDistMask = pipelineInfo->getGlState().enableMapClipDistMask; + options.clipPlaneMask = pipelineInfo->rsState.usrClipPlaneMask; + // Only set NGG options for a GFX10+ graphics pipeline. 
const auto &nggState = pipelineInfo->nggState; if (!nggState.enableNgg && getGfxIpVersion().major < 11) // GFX11+ must enable NGG diff --git a/llpc/include/llpc.h b/llpc/include/llpc.h index be52632f4b..f74d043222 100644 --- a/llpc/include/llpc.h +++ b/llpc/include/llpc.h @@ -158,6 +158,7 @@ struct RayTracingPipelineBuildOut { ///< when compiling in pure pipeline mode bool hasTraceRay; ///< Output whether have traceray module bool isCps; ///< Output whether is the pipeline is compiled in CPS mode + bool hasKernelEntry; ///< Output whether the output pipeline binaries contain kernel entry }; #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 66 diff --git a/llpc/lower/llpcSpirvLowerAccessChain.cpp b/llpc/lower/LowerAccessChain.cpp similarity index 93% rename from llpc/lower/llpcSpirvLowerAccessChain.cpp rename to llpc/lower/LowerAccessChain.cpp index 1221a407c4..3f9756d1c3 100644 --- a/llpc/lower/llpcSpirvLowerAccessChain.cpp +++ b/llpc/lower/LowerAccessChain.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerAccessChain.cpp - * @brief LLPC source file: contains implementation of class Llpc::SpirvLowerAccessChain. + * @file LowerAccessChain.cpp + * @brief LLPC source file: contains implementation of class Llpc::LowerAccessChain. 
*********************************************************************************************************************** */ -#include "llpcSpirvLowerAccessChain.h" +#include "LowerAccessChain.h" #include "SPIRVInternal.h" #include "lgc/Builder.h" #include "llvm/IR/Instructions.h" @@ -36,7 +36,7 @@ #include "llvm/Support/raw_ostream.h" #include -#define DEBUG_TYPE "llpc-spirv-lower-access-chain" +#define DEBUG_TYPE "lower-access-chain" using namespace llvm; using namespace SPIRV; @@ -49,7 +49,7 @@ namespace Llpc { // // @param [in/out] module : LLVM module to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerAccessChain::run(Module &module, ModuleAnalysisManager &analysisManager) { +PreservedAnalyses LowerAccessChain::run(Module &module, ModuleAnalysisManager &analysisManager) { LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Access-Chain\n"); SpirvLower::init(&module); @@ -69,7 +69,7 @@ PreservedAnalyses SpirvLowerAccessChain::run(Module &module, ModuleAnalysisManag // { { [4 x float] } } // // @param gep : Getelementptr instruction. -void SpirvLowerAccessChain::tryToAddMissingIndicesBetweenGVandGEP(GEPOperator *gep) { +void LowerAccessChain::tryToAddMissingIndicesBetweenGVandGEP(GEPOperator *gep) { // We are interested only in address spaces which are used while doing global value lowering for store and load. 
const unsigned addrSpace = gep->getType()->getPointerAddressSpace(); @@ -101,7 +101,7 @@ void SpirvLowerAccessChain::tryToAddMissingIndicesBetweenGVandGEP(GEPOperator *g // Visits "load" instruction // // @param loadInst : "Load" instruction -void SpirvLowerAccessChain::visitLoadInst(LoadInst &loadInst) { +void LowerAccessChain::visitLoadInst(LoadInst &loadInst) { if (GEPOperator *gep = dyn_cast(loadInst.getPointerOperand())) { m_builder->SetInsertPoint(&loadInst); tryToAddMissingIndicesBetweenGVandGEP(gep); @@ -112,7 +112,7 @@ void SpirvLowerAccessChain::visitLoadInst(LoadInst &loadInst) { // Visits "store" instruction // // @param storeInst : "Store" instruction -void SpirvLowerAccessChain::visitStoreInst(StoreInst &storeInst) { +void LowerAccessChain::visitStoreInst(StoreInst &storeInst) { if (GEPOperator *gep = dyn_cast(storeInst.getPointerOperand())) { m_builder->SetInsertPoint(&storeInst); tryToAddMissingIndicesBetweenGVandGEP(gep); @@ -123,7 +123,7 @@ void SpirvLowerAccessChain::visitStoreInst(StoreInst &storeInst) { // Visits "getelementptr" instruction. // // @param getElemPtrInst : "Getelementptr" instruction -void SpirvLowerAccessChain::visitGetElementPtrInst(GetElementPtrInst &getElemPtrInst) { +void LowerAccessChain::visitGetElementPtrInst(GetElementPtrInst &getElemPtrInst) { // NOTE: Here, we try to coalesce chained "getelementptr" instructions (created from multi-level access chain). // Because the metadata is always decorated on top-level pointer value (actually a global variable). 
const unsigned addrSpace = getElemPtrInst.getType()->getPointerAddressSpace(); @@ -151,7 +151,7 @@ void SpirvLowerAccessChain::visitGetElementPtrInst(GetElementPtrInst &getElemPtr // // @param getElemPtr : "getelementptr" instruction in the bottom to do coalescing // @param addrSpace : Address space of the pointer value of "getelementptr" -GetElementPtrInst *SpirvLowerAccessChain::tryToCoalesceChain(GetElementPtrInst *getElemPtr, unsigned addrSpace) { +GetElementPtrInst *LowerAccessChain::tryToCoalesceChain(GetElementPtrInst *getElemPtr, unsigned addrSpace) { GetElementPtrInst *coalescedGetElemPtr = getElemPtr; std::stack chainedInsts; // Order: from top to bottom @@ -262,8 +262,8 @@ GetElementPtrInst *SpirvLowerAccessChain::tryToCoalesceChain(GetElementPtrInst * // @param indexOperands : vector to which zero-index elements will be added // @param typeToMatch : type used as destination of unpacking "baseType" // @param baseType : packed type which will be unpacked. -void SpirvLowerAccessChain::appendZeroIndexToMatchTypes(SmallVectorImpl &indexOperands, Type *typeToMatch, - Type *baseType) { +void LowerAccessChain::appendZeroIndexToMatchTypes(SmallVectorImpl &indexOperands, Type *typeToMatch, + Type *baseType) { Type *unpackType = baseType; Value *zero = ConstantInt::get(Type::getInt32Ty(m_module->getContext()), 0); while (unpackType != typeToMatch) { diff --git a/llpc/lower/llpcSpirvLowerAccessChain.h b/llpc/lower/LowerAccessChain.h similarity index 91% rename from llpc/lower/llpcSpirvLowerAccessChain.h rename to llpc/lower/LowerAccessChain.h index c8f91148c4..4d551b91b6 100644 --- a/llpc/lower/llpcSpirvLowerAccessChain.h +++ b/llpc/lower/LowerAccessChain.h @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerAccessChain.h - * @brief 
LLPC header file: contains declaration of class Llpc::SpirvLowerAccessChain. + * @file LowerAccessChain.h + * @brief LLPC header file: contains declaration of class Llpc::LowerAccessChain. *********************************************************************************************************************** */ #pragma once @@ -39,9 +39,9 @@ namespace Llpc { // ===================================================================================================================== // Represents the pass of SPIR-V lowering operations for access chain. -class SpirvLowerAccessChain : public SpirvLower, - public llvm::InstVisitor, - public llvm::PassInfoMixin { +class LowerAccessChain : public SpirvLower, + public llvm::InstVisitor, + public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); virtual void visitGetElementPtrInst(llvm::GetElementPtrInst &getElemPtrInst); diff --git a/llpc/lower/LowerAdvancedBlend.cpp b/llpc/lower/LowerAdvancedBlend.cpp index 6e937329dc..d2bd39174c 100644 --- a/llpc/lower/LowerAdvancedBlend.cpp +++ b/llpc/lower/LowerAdvancedBlend.cpp @@ -29,11 +29,11 @@ *********************************************************************************************************************** */ #include "LowerAdvancedBlend.h" +#include "LowerInternalLibraryIntrinsic.h" #include "SPIRVInternal.h" -#include "compilerutils/CompilerUtils.h" #include "llpcContext.h" -#include "llpcSpirvLowerInternalLibraryIntrinsicUtil.h" #include "vkgcDefs.h" +#include "compilerutils/CompilerUtils.h" #include "lgc/Builder.h" #include "lgc/RuntimeContext.h" @@ -45,11 +45,12 @@ using namespace Llpc; namespace Llpc { static const char *AdvancedBlendInternal = "AmdAdvancedBlendInternal"; +static const char *AdvancedBlendInternalRov = "AmdAdvancedBlendInternalRov"; static const char *AdvancedBlendModeName = "_mode"; static const char *AdvancedBlendIsMsaaName = "_isMsaa"; // 
===================================================================================================================== -LowerAdvancedBlend::LowerAdvancedBlend(unsigned binding) : m_binding(binding) { +LowerAdvancedBlend::LowerAdvancedBlend(unsigned binding, bool enableRov) : m_binding(binding), m_enableRov(enableRov) { } // ===================================================================================================================== @@ -85,33 +86,53 @@ void LowerAdvancedBlend::processFsOutputs(Module &module) { if (global.getType()->getAddressSpace() == SPIRAS_Uniform && global.getName().ends_with(AdvancedBlendIsMsaaName)) isMsaaUniform = &global; } - // Prepare arguments of AmdAdvancedBlend(inColor, imageDescMs, imageDesc, fmaskDesc, mode, isMsaa) from shaderLibrary + m_builder->SetInsertPointPastAllocas(m_entryPoint); + SmallVector args; + args.push_back(nullptr); // placeholder for inColor + + if (!m_enableRov) { + // Prepare arguments of AmdAdvancedBlendInternal(inColor, imageDescMs, imageDesc, fmaskDesc, mode, isMsaa) + // Get the parameters and store them into the allocated parameter points + unsigned bindings[2] = {m_binding, m_binding + 1}; + Value *imageDesc[2] = {}; + for (unsigned id = 0; id < 2; ++id) { + unsigned descSet = + PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNodeType::DescriptorResource); + imageDesc[id] = m_builder->CreateGetDescPtr(ResourceNodeType::DescriptorResource, + ResourceNodeType::DescriptorResource, descSet, bindings[id]); + imageDesc[id] = m_builder->CreatePtrToInt(imageDesc[id], m_builder->getInt64Ty()); + args.push_back(imageDesc[id]); + } - // Get the parameters and store them into the allocated parameter points - unsigned bindings[2] = {m_binding, m_binding + 1}; - Value *imageDesc[2] = {}; - for (unsigned id = 0; id < 2; ++id) { + unsigned descSet = PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNodeType::DescriptorFmask); + Value *fmaskDesc = 
m_builder->CreateGetDescPtr(ResourceNodeType::DescriptorFmask, ResourceNodeType::DescriptorFmask, + descSet, m_binding); + fmaskDesc = m_builder->CreatePtrToInt(fmaskDesc, m_builder->getInt64Ty()); + args.push_back(fmaskDesc); + } else { + // Prepare arguments of AmdAdvancedBlendInternalRov(inColor, rovDesc, mode, isMsaa) unsigned descSet = PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNodeType::DescriptorResource); - imageDesc[id] = m_builder->CreateGetDescPtr(ResourceNodeType::DescriptorResource, - ResourceNodeType::DescriptorResource, descSet, bindings[id]); - imageDesc[id] = m_builder->CreatePtrToInt(imageDesc[id], m_builder->getInt64Ty()); + Value *rovDesc = + m_builder->CreateGetDescPtr(ResourceNodeType::DescriptorResource, ResourceNodeType::DescriptorResource, descSet, + Vkgc::InternalBinding::AdvancedBlendInternalBinding); + rovDesc = m_builder->CreatePtrToInt(rovDesc, m_builder->getInt64Ty()); + args.push_back(rovDesc); } - unsigned descSet = PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNodeType::DescriptorFmask); - Value *fmaskDesc = m_builder->CreateGetDescPtr(ResourceNodeType::DescriptorFmask, ResourceNodeType::DescriptorFmask, - descSet, m_binding); - fmaskDesc = m_builder->CreatePtrToInt(fmaskDesc, m_builder->getInt64Ty()); - assert(modeUniform && isMsaaUniform); modeUniform = m_builder->CreateLoad(m_builder->getInt32Ty(), modeUniform); + cast(modeUniform)->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(*m_context, {})); + args.push_back(modeUniform); - isMsaaUniform = - m_builder->CreateTrunc(m_builder->CreateLoad(m_builder->getInt32Ty(), isMsaaUniform), m_builder->getInt1Ty()); + isMsaaUniform = m_builder->CreateLoad(m_builder->getInt32Ty(), isMsaaUniform); + cast(isMsaaUniform)->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(*m_context, {})); + args.push_back(isMsaaUniform); // Link the gfxruntime library module GfxRuntimeContext &gfxRuntimeContext = GfxRuntimeContext::get(*m_context); - 
auto *advancedBlendFunc = (*gfxRuntimeContext.theModule).getFunction(AdvancedBlendInternal); + auto *advancedBlendFunc = + (*gfxRuntimeContext.theModule).getFunction(m_enableRov ? AdvancedBlendInternalRov : AdvancedBlendInternal); CompilerUtils::CrossModuleInliner inliner; @@ -121,12 +142,10 @@ void LowerAdvancedBlend::processFsOutputs(Module &module) { auto storeInst = cast(user); assert(storeInst); Value *srcVal = storeInst->getValueOperand(); + args[0] = srcVal; m_builder->SetInsertPoint(storeInst); - Value *blendColor = inliner - .inlineCall(*m_builder, advancedBlendFunc, - {srcVal, imageDesc[0], imageDesc[1], fmaskDesc, modeUniform, isMsaaUniform}) - .returnValue; + Value *blendColor = inliner.inlineCall(*m_builder, advancedBlendFunc, args).returnValue; storeInst->setOperand(0, blendColor); } diff --git a/llpc/lower/LowerAdvancedBlend.h b/llpc/lower/LowerAdvancedBlend.h index 3539368eeb..61412ecada 100644 --- a/llpc/lower/LowerAdvancedBlend.h +++ b/llpc/lower/LowerAdvancedBlend.h @@ -40,7 +40,7 @@ namespace Llpc { class LowerAdvancedBlend : public SpirvLower, public llvm::PassInfoMixin { public: - LowerAdvancedBlend(unsigned binding = 0); + LowerAdvancedBlend(unsigned binding, bool enableRov); llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); static llvm::StringRef name() { return "Lower SPIR-V advanced blend shader"; } @@ -49,5 +49,6 @@ class LowerAdvancedBlend : public SpirvLower, public llvm::PassInfoMixin &loopBlocks, // Determine all functions and block with a convergent function call. 
// // @param [in/out] module : LLVM module to be run on -void SpirvLowerCfgMerges::mapConvergentValues(Module &module) { +void LowerCfgMerges::mapConvergentValues(Module &module) { // Map convergent exposure for blocks and functions SmallVector worklist; @@ -304,8 +304,8 @@ void SpirvLowerCfgMerges::mapConvergentValues(Module &module) { // // @param [in/out] module : LLVM module to be run on (empty on entry) // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerCfgMerges::run(Module &module, ModuleAnalysisManager &analysisManager) { - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-CfgMerges\n"); +PreservedAnalyses LowerCfgMerges::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Lower-CfgMerges\n"); LLVM_DEBUG(dbgs() << "Processing module: " << module); SpirvLower::init(&module); diff --git a/llpc/lower/llpcSpirvLowerCfgMerges.h b/llpc/lower/LowerCfgMerges.h similarity index 88% rename from llpc/lower/llpcSpirvLowerCfgMerges.h rename to llpc/lower/LowerCfgMerges.h index 112dfbbaa5..0cdba4218e 100644 --- a/llpc/lower/llpcSpirvLowerCfgMerges.h +++ b/llpc/lower/LowerCfgMerges.h @@ -1,7 +1,7 @@ /* *********************************************************************************************************************** * - * Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerCfgMerges.h - * @brief LLPC header file: contains declaration of Llpc::SpirvLowerCfgMerges + * @file LowerCfgMerges.h + * @brief LLPC header file: contains declaration of Llpc::LowerCfgMerges *********************************************************************************************************************** */ #pragma once @@ -42,7 +42,7 @@ namespace Llpc { // ===================================================================================================================== // Represents the pass of SPIR-V lowering CFG merges. -class SpirvLowerCfgMerges : public SpirvLower, public llvm::PassInfoMixin { +class LowerCfgMerges : public SpirvLower, public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); diff --git a/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp b/llpc/lower/LowerConstImmediateStore.cpp similarity index 99% rename from llpc/lower/llpcSpirvLowerConstImmediateStore.cpp rename to llpc/lower/LowerConstImmediateStore.cpp index e98e629ec6..4c0ae2502e 100644 --- a/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp +++ b/llpc/lower/LowerConstImmediateStore.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerConstImmediateStore.cpp + * @file LowerConstImmediateStore.cpp * @brief LLPC source file: contains 
implementation of class Llpc::SpirvLowerConstImmediateStore. *********************************************************************************************************************** */ -#include "llpcSpirvLowerConstImmediateStore.h" +#include "LowerConstImmediateStore.h" #include "SPIRVInternal.h" #include "llpcContext.h" #include "llvm/Analysis/ValueTracking.h" diff --git a/llpc/lower/llpcSpirvLowerConstImmediateStore.h b/llpc/lower/LowerConstImmediateStore.h similarity index 98% rename from llpc/lower/llpcSpirvLowerConstImmediateStore.h rename to llpc/lower/LowerConstImmediateStore.h index 8fa3fd9bac..2df8c0b1c4 100644 --- a/llpc/lower/llpcSpirvLowerConstImmediateStore.h +++ b/llpc/lower/LowerConstImmediateStore.h @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerConstImmediateStore.h + * @file LowerConstImmediateStore.h * @brief LLPC header file: contains declaration of class Llpc::SpirvLowerConstImmediateStore. *********************************************************************************************************************** */ diff --git a/llpc/lower/llpcSpirvLowerCooperativeMatrix.cpp b/llpc/lower/LowerCooperativeMatrix.cpp similarity index 97% rename from llpc/lower/llpcSpirvLowerCooperativeMatrix.cpp rename to llpc/lower/LowerCooperativeMatrix.cpp index a1dedf2ad4..432f8279c4 100644 --- a/llpc/lower/llpcSpirvLowerCooperativeMatrix.cpp +++ b/llpc/lower/LowerCooperativeMatrix.cpp @@ -1,7 +1,7 @@ /* *********************************************************************************************************************** * - * Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerCooperativeMatrix.cpp + * @file LowerCooperativeMatrix.cpp * @brief LLPC source file: pass that lower SPIR-V specific cooperative matrix operations * * This currently only handles spirv.cooperative.matrix.proxy, which is used to proxy pointers to cooperative matrix @@ -32,7 +32,7 @@ *********************************************************************************************************************** */ -#include "llpcSpirvLowerCooperativeMatrix.h" +#include "LowerCooperativeMatrix.h" #include "llpcDialect.h" #include "lgc/BuilderCommon.h" #include "lgc/LgcDialect.h" diff --git a/llpc/lower/llpcSpirvLowerCooperativeMatrix.h b/llpc/lower/LowerCooperativeMatrix.h similarity index 94% rename from llpc/lower/llpcSpirvLowerCooperativeMatrix.h rename to llpc/lower/LowerCooperativeMatrix.h index dca651645f..ea15854624 100644 --- a/llpc/lower/llpcSpirvLowerCooperativeMatrix.h +++ b/llpc/lower/LowerCooperativeMatrix.h @@ -1,7 +1,7 @@ /* *********************************************************************************************************************** * - * Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerCooperativeMatrix.h + * @file LowerCooperativeMatrix.h * @brief LLPC header file: lower SPIR-V specific cooperative matrix operations to LGC *********************************************************************************************************************** */ diff --git a/llpc/lower/LowerGLCompatibility.cpp b/llpc/lower/LowerGLCompatibility.cpp index 41e023b585..a10f5bcd36 100644 --- a/llpc/lower/LowerGLCompatibility.cpp +++ b/llpc/lower/LowerGLCompatibility.cpp @@ -68,7 +68,7 @@ PreservedAnalyses LowerGLCompatibility::run(Module &module, ModuleAnalysisManage if (!needLowerClipVertex() && !needLowerFrontColor() && !needLowerBackColor() && !needLowerFrontSecondaryColor() && !needLowerBackSecondaryColor() && !needEmulateDrawPixels() && !needEmulateTwoSideLighting() && - !needEmulateBitmap() && !needLowerFragColor() && !needEmulateSmoothStipple()) + !needEmulateBitmap() && !needLowerFragColor() && !needEmulateSmoothStipple() && !needLowerAlphaTest()) return PreservedAnalyses::all(); buildPatchPositionInfo(); @@ -91,6 +91,9 @@ PreservedAnalyses LowerGLCompatibility::run(Module &module, ModuleAnalysisManage if (needLowerFragColor()) lowerFragColor(); + if (needLowerAlphaTest()) + lowerAlphaTest(); + if (needEmulateDrawPixels()) emulateDrawPixels(); @@ -119,6 +122,8 @@ bool LowerGLCompatibility::needRun() { ->pModuleData); auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); auto options = m_context->getPipelineContext()->getPipelineOptions(); + bool enableAlphaTest = + (m_shaderStage == Vkgc::ShaderStageFragment && 
buildInfo->glState.alphaTestFunc != Vkgc::AlphaTestFunc::Always); result |= moduleData->usage.useClipVertex; result |= moduleData->usage.useFrontColor; result |= moduleData->usage.useBackColor; @@ -132,6 +137,7 @@ bool LowerGLCompatibility::needRun() { result |= options->getGlState().enablePolygonStipple; result |= options->getGlState().enableLineSmooth; result |= options->getGlState().enablePointSmooth; + result |= enableAlphaTest; } return result; } @@ -536,6 +542,13 @@ bool LowerGLCompatibility::needLowerFragColor() { return m_fragColor && (m_shaderStage == ShaderStageFragment) && (buildInfo->glState.enableColorClampFs); } +// ===================================================================================================================== +// Check whether need do alphaTest. +bool LowerGLCompatibility::needLowerAlphaTest() { + auto buildInfo = static_cast(m_context->getPipelineBuildInfo()); + return (m_shaderStage == ShaderStageFragment) && (buildInfo->glState.alphaTestFunc != Vkgc::AlphaTestFunc::Always); +} + // ===================================================================================================================== // Create InOut global variable Metadata. // @@ -1189,4 +1202,143 @@ void LowerGLCompatibility::lowerFragColor() { lowerColor(m_fragColor); } +// ===================================================================================================================== +// Does lowering operations for alpha test. 
+void LowerGLCompatibility::lowerAlphaTest() { + GlobalVariable *outputLocationZero = nullptr; + auto floatTy = m_builder->getFloatTy(); + Type *vec4Type = VectorType::get(floatTy, 4, false); + + for (GlobalVariable &global : m_module->globals()) { + if (global.getType()->getAddressSpace() == SPIRAS_Output) { + ShaderInOutMetadata outputMeta = {}; + MDNode *metaNode = global.getMetadata(gSPIRVMD::InOut); + auto *meta = mdconst::dyn_extract(metaNode->getOperand(0)); + outputMeta.U64All[0] = cast(meta->getOperand(0))->getZExtValue(); + outputMeta.U64All[1] = cast(meta->getOperand(1))->getZExtValue(); + + if (outputMeta.Value == 0) { + outputLocationZero = &global; + break; + } + } + } + + if (outputLocationZero != nullptr && outputLocationZero->getValueType()->isVectorTy()) { + auto type = cast(outputLocationZero->getValueType()); + uint32_t vectorNum = type->getNumElements(); + if (vectorNum != 4) + return; + } else + return; + + auto buildInfo = static_cast(m_context->getPipelineBuildInfo()); + auto predicate = CmpInst::Predicate::BAD_FCMP_PREDICATE; + + switch (buildInfo->glState.alphaTestFunc) { + case Vkgc::AlphaTestFunc::Always: { + // always pass, do nothing + return; + } + case Vkgc::AlphaTestFunc::Never: { + predicate = CmpInst::Predicate::FCMP_FALSE; + break; + } + case Vkgc::AlphaTestFunc::Less: { + predicate = CmpInst::Predicate::FCMP_OLT; + break; + } + case Vkgc::AlphaTestFunc::LEqual: { + predicate = CmpInst::Predicate::FCMP_OLE; + break; + } + case Vkgc::AlphaTestFunc::Equal: { + predicate = CmpInst::Predicate::FCMP_OEQ; + break; + } + case Vkgc::AlphaTestFunc::GEqual: { + predicate = CmpInst::Predicate::FCMP_OGE; + break; + } + case Vkgc::AlphaTestFunc::Greater: { + predicate = CmpInst::Predicate::FCMP_OGT; + break; + } + case Vkgc::AlphaTestFunc::NotEqual: { + predicate = CmpInst::Predicate::FCMP_ONE; + break; + } + } + + m_builder->SetInsertPoint(m_retInst); + auto lastBB = m_builder->GetInsertBlock(); + lastBB->splitBasicBlock(m_retInst); + 
m_builder->SetInsertPoint(lastBB->getTerminator()); + + // if the alpha test is never, then discard it + if (predicate == llvm::CmpInst::Predicate::FCMP_FALSE) { + // Always discard. + m_builder->CreateKill(); + return; + } + + // get mrt0.alpha + Value *outputValue = m_builder->CreateLoad(vec4Type, outputLocationZero); + Value *outputAlpha = m_builder->CreateExtractElement(outputValue, 3); + + // get alphaRef + auto alphaRef = new GlobalVariable(*m_module, floatTy, false, GlobalValue::ExternalLinkage, nullptr, "alphaTestRef", + nullptr, GlobalVariable::NotThreadLocal, SPIRV::SPIRAS_Uniform); + auto locationFound = + getUniformConstantEntryByLocation(m_context, m_shaderStage, Vkgc::GlCompatibilityUniformLocation::AlphaTestRef); + assert(locationFound != nullptr); + auto alphaTestBaseOffset = locationFound->offset; + unsigned constBufferBinding = + Vkgc::ConstantBuffer0Binding + static_cast(m_context->getPipelineContext()) + ->getPipelineShaderInfo(m_shaderStage) + ->options.constantBufferBindingOffset; + std::vector mDs; + auto int32Ty = Type::getInt32Ty(*m_context); + mDs.push_back(ConstantAsMetadata::get(ConstantInt::get(int32Ty, Vkgc::InternalDescriptorSetId))); + mDs.push_back(ConstantAsMetadata::get(ConstantInt::get(int32Ty, constBufferBinding))); + mDs.push_back(ConstantAsMetadata::get(ConstantInt::get(int32Ty, alphaTestBaseOffset))); + mDs.push_back(ConstantAsMetadata::get(ConstantInt::get(int32Ty, Vkgc::GlCompatibilityUniformLocation::AlphaTestRef))); + auto mdNode = MDNode::get(*m_context, mDs); + alphaRef->addMetadata(gSPIRVMD::UniformConstant, *mdNode); + + Value *refValue = m_builder->CreateLoad(floatTy, alphaRef); + + // br %1, label %.AlphaTestDiscard, label %.AlphaTestPass + // + // .AlphaTestDiscard: + // call void (...) 
@lgc.create.kill()
+ * @file LowerGlobals.cpp + * @brief LLPC source file: contains implementation of class Llpc::LowerGlobals. *********************************************************************************************************************** */ -#include "llpcSpirvLowerGlobal.h" +#include "LowerGlobals.h" #include "SPIRVInternal.h" -#include "compilerutils/CompilerUtils.h" -#include "compilerutils/TypesMetadata.h" #include "llpcContext.h" #include "llpcDebug.h" #include "llpcGraphicsContext.h" #include "llpcRayTracingContext.h" #include "llpcSpirvLowerUtil.h" +#include "compilerutils/CompilerUtils.h" +#include "compilerutils/TypesMetadata.h" #include "lgc/LgcDialect.h" #include "lgc/LgcRtDialect.h" #include "llvm-dialects/Dialect/Visitor.h" @@ -51,7 +51,7 @@ #include "llvm/Transforms/Utils/Cloning.h" #include -#define DEBUG_TYPE "llpc-spirv-lower-global" +#define DEBUG_TYPE "lower-globals" using namespace llvm; using namespace SPIRV; @@ -189,7 +189,7 @@ static_assert(lgc::ShadingRateHorizontal4Pixels == "Shading rate flag mismatch"); // ===================================================================================================================== -SpirvLowerGlobal::SpirvLowerGlobal() : m_lastVertexProcessingStage(ShaderStageInvalid) { +LowerGlobals::LowerGlobals() : m_lastVertexProcessingStage(ShaderStageInvalid) { } // ===================================================================================================================== @@ -197,7 +197,7 @@ SpirvLowerGlobal::SpirvLowerGlobal() : m_lastVertexProcessingStage(ShaderStageIn // // @param [in/out] module : LLVM module to be run on (empty on entry) // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerGlobal::run(Module &module, ModuleAnalysisManager &analysisManager) { +PreservedAnalyses LowerGlobals::run(Module &module, ModuleAnalysisManager &analysisManager) { LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Global\n"); SpirvLower::init(&module); @@ 
-283,7 +283,7 @@ PreservedAnalyses SpirvLowerGlobal::run(Module &module, ModuleAnalysisManager &a // ===================================================================================================================== // add edgeflag input output -void SpirvLowerGlobal::lowerEdgeFlag() { +void LowerGlobals::lowerEdgeFlag() { const unsigned int edgeflagInputLocation = Vkgc::GlCompatibilityAttributeLocation::EdgeFlag; Llpc::PipelineContext *pipelineContext = m_context->getPipelineContext(); @@ -312,7 +312,7 @@ void SpirvLowerGlobal::lowerEdgeFlag() { // ===================================================================================================================== // Ensure that there is exactly one "ret" instruction. This is used for writing output variables for many shader types. -void SpirvLowerGlobal::ensureUnifiedReturn() { +void LowerGlobals::ensureUnifiedReturn() { SmallVector retInsts; for (BasicBlock &block : *m_entryPoint) { @@ -346,7 +346,7 @@ void SpirvLowerGlobal::ensureUnifiedReturn() { // // @param checkEmitCall : Whether we should handle emit call or not // @param checkInterpCall : Whether we should handle interpolate call or not -void SpirvLowerGlobal::handleCallInst(bool checkEmitCall, bool checkInterpCall) { +void LowerGlobals::handleCallInst(bool checkEmitCall, bool checkInterpCall) { assert(checkEmitCall != checkInterpCall); for (Function &function : m_module->functions()) { @@ -457,7 +457,7 @@ static bool hasPrimitiveIdx(const Constant &metaVal) { // Maps the specified global variable to proxy variable. 
// // @param globalVar : Global variable to be mapped -void SpirvLowerGlobal::mapGlobalVariableToProxy(GlobalVariable *globalVar) { +void LowerGlobals::mapGlobalVariableToProxy(GlobalVariable *globalVar) { const auto &dataLayout = m_module->getDataLayout(); Type *globalVarTy = globalVar->getValueType(); @@ -493,7 +493,7 @@ void SpirvLowerGlobal::mapGlobalVariableToProxy(GlobalVariable *globalVar) { // Lowers an input or output global variable. // // @param globalVar : the global variable to be lowered -void SpirvLowerGlobal::lowerInOut(llvm::GlobalVariable *globalVar) { +void LowerGlobals::lowerInOut(llvm::GlobalVariable *globalVar) { assert(globalVar->getAddressSpace() == SPIRAS_Input || globalVar->getAddressSpace() == SPIRAS_Output); const bool isInput = globalVar->getAddressSpace() == SPIRAS_Input; @@ -575,8 +575,8 @@ void SpirvLowerGlobal::lowerInOut(llvm::GlobalVariable *globalVar) { // to in-place import/export ops. // // This makes the assumption that GEPs have not been type-punned (though 0 indices may have been dropped). 
-void SpirvLowerGlobal::lowerInOutUsersInPlace(llvm::GlobalVariable *globalVar, llvm::Value *current, - SmallVectorImpl &indexStack) { +void LowerGlobals::lowerInOutUsersInPlace(llvm::GlobalVariable *globalVar, llvm::Value *current, + SmallVectorImpl &indexStack) { for (User *user : llvm::make_early_inc_range(current->users())) { Instruction *inst = cast(user); @@ -639,7 +639,7 @@ void SpirvLowerGlobal::lowerInOutUsersInPlace(llvm::GlobalVariable *globalVar, l // ===================================================================================================================== // @param builtIn : BuiltIn value // @param elemIdx : Element Index of struct -Value *SpirvLowerGlobal::createRaytracingBuiltIn(BuiltIn builtIn) { +Value *LowerGlobals::createRaytracingBuiltIn(BuiltIn builtIn) { switch (builtIn) { case BuiltInLaunchIdKHR: return m_builder->create(); @@ -719,10 +719,10 @@ inline bool isRayTracingBuiltIn(unsigned builtIn, ShaderStage stage) { // "InterpLocSample" - Value is offset from the center of the pixel for "InterpLocCenter" - Value is vertex no. 
(0 ~ 2) // for "InterpLocCustom" // @param isPerVertexDimension : Whether this is a per vertex variable -Value *SpirvLowerGlobal::addCallInstForInOutImport(Type *inOutTy, unsigned addrSpace, Constant *inOutMetaVal, - Value *locOffset, unsigned maxLocOffset, Value *elemIdx, - Value *vertexIdx, unsigned interpLoc, Value *auxInterpValue, - bool isPerVertexDimension) { +Value *LowerGlobals::addCallInstForInOutImport(Type *inOutTy, unsigned addrSpace, Constant *inOutMetaVal, + Value *locOffset, unsigned maxLocOffset, Value *elemIdx, + Value *vertexIdx, unsigned interpLoc, Value *auxInterpValue, + bool isPerVertexDimension) { assert(addrSpace == SPIRAS_Input || (addrSpace == SPIRAS_Output && m_shaderStage == ShaderStageTessControl)); Value *inOutValue = PoisonValue::get(inOutTy); @@ -983,10 +983,9 @@ Value *SpirvLowerGlobal::addCallInstForInOutImport(Type *inOutTy, unsigned addrS // vector component index, for built-in input/output, it could be element index of scalar array) // @param vertexOrPrimitiveIdx : Output array outermost index used for vertex indexing // @param emitStreamId : ID of emitted vertex stream, valid for geometry shader (0xFFFFFFFF for others) -void SpirvLowerGlobal::addCallInstForOutputExport(Value *outputValue, Constant *outputMetaVal, Value *locOffset, - unsigned maxLocOffset, unsigned xfbOffsetAdjust, - unsigned xfbBufferAdjust, Value *elemIdx, Value *vertexOrPrimitiveIdx, - unsigned emitStreamId) { +void LowerGlobals::addCallInstForOutputExport(Value *outputValue, Constant *outputMetaVal, Value *locOffset, + unsigned maxLocOffset, unsigned xfbOffsetAdjust, unsigned xfbBufferAdjust, + Value *elemIdx, Value *vertexOrPrimitiveIdx, unsigned emitStreamId) { Type *outputTy = outputValue->getType(); ShaderInOutMetadata outputMeta = {}; @@ -1156,9 +1155,9 @@ void SpirvLowerGlobal::addCallInstForOutputExport(Value *outputValue, Constant * // - Sample ID for "InterpLocSample" // - Offset from the center of the pixel for "InterpLocCenter" // - Vertex no. 
(0 ~ 2) for "InterpLocCustom" -Value *SpirvLowerGlobal::loadDynamicIndexedMembers(Type *inOutTy, unsigned addrSpace, ArrayRef indexOperands, - Constant *inOutMetaVal, Value *locOffset, unsigned interpLoc, - Value *auxInterpValue, bool isPerVertexDimension) { +Value *LowerGlobals::loadDynamicIndexedMembers(Type *inOutTy, unsigned addrSpace, ArrayRef indexOperands, + Constant *inOutMetaVal, Value *locOffset, unsigned interpLoc, + Value *auxInterpValue, bool isPerVertexDimension) { // Currently this is only used in fragment shader on loading interpolate sources. assert(m_shaderStage == ShaderStageFragment); @@ -1261,10 +1260,9 @@ Value *SpirvLowerGlobal::loadDynamicIndexedMembers(Type *inOutTy, unsigned addrS // @param auxInterpValue : Auxiliary value of interpolation (valid for fragment shader): - Sample ID for // "InterpLocSample" - Offset from the center of the pixel for "InterpLocCenter" - Vertex no. (0 ~ 2) for // "InterpLocCustom" -Value *SpirvLowerGlobal::loadInOutMember(Type *inOutTy, Type *loadTy, unsigned addrSpace, - ArrayRef indexOperands, unsigned maxLocOffset, Constant *inOutMetaVal, - Value *locOffset, Value *vertexIdx, unsigned interpLoc, Value *auxInterpValue, - bool isPerVertexDimension) { +Value *LowerGlobals::loadInOutMember(Type *inOutTy, Type *loadTy, unsigned addrSpace, ArrayRef indexOperands, + unsigned maxLocOffset, Constant *inOutMetaVal, Value *locOffset, Value *vertexIdx, + unsigned interpLoc, Value *auxInterpValue, bool isPerVertexDimension) { assert(m_shaderStage == ShaderStageTessControl || m_shaderStage == ShaderStageTessEval || m_shaderStage == ShaderStageMesh || m_shaderStage == ShaderStageFragment); @@ -1363,9 +1361,9 @@ Value *SpirvLowerGlobal::loadInOutMember(Type *inOutTy, Type *loadTy, unsigned a // @param outputMetaVal : Metadata of this output member // @param locOffset : Relative location offset of this output member // @param vertexOrPrimitiveIdx : Input array outermost index used for vertex indexing -void 
SpirvLowerGlobal::storeOutputMember(Type *outputTy, Type *storeTy, Value *storeValue, - ArrayRef indexOperands, unsigned maxLocOffset, - Constant *outputMetaVal, Value *locOffset, Value *vertexOrPrimitiveIdx) { +void LowerGlobals::storeOutputMember(Type *outputTy, Type *storeTy, Value *storeValue, ArrayRef indexOperands, + unsigned maxLocOffset, Constant *outputMetaVal, Value *locOffset, + Value *vertexOrPrimitiveIdx) { assert(m_shaderStage == ShaderStageTessControl || m_shaderStage == ShaderStageMesh); // indexOperands can be empty with mismatch of types, if zero-index GEP was removed and global is used directly by @@ -1446,7 +1444,7 @@ void SpirvLowerGlobal::storeOutputMember(Type *outputTy, Type *storeTy, Value *s // ===================================================================================================================== // Lowers buffer blocks. -void SpirvLowerGlobal::lowerBufferBlock() { +void LowerGlobals::lowerBufferBlock() { SmallVector globalsToRemove; // With opaque pointers actually any instruction can be the user of the global variable since, zero-index GEPs @@ -1807,7 +1805,7 @@ void SpirvLowerGlobal::lowerBufferBlock() { // ===================================================================================================================== // Lowers aliased variables. -void SpirvLowerGlobal::lowerAliasedVal() { +void LowerGlobals::lowerAliasedVal() { // NOTE: When enable CapabilityWorkgroupMemoryExplicitLayoutKHR, Workgroup variables can be declared in blocks, // and then use the same explicit layout decorations (e.g. Offset, ArrayStride) as other storage classes. All the // Workgroup blocks share the same underlying storage, and either all or none of the variables must be explicitly @@ -1843,7 +1841,7 @@ void SpirvLowerGlobal::lowerAliasedVal() { // ===================================================================================================================== // Lowers task payload. 
-void SpirvLowerGlobal::lowerTaskPayload() { +void LowerGlobals::lowerTaskPayload() { GlobalVariable *globalToRemove = nullptr; for (GlobalVariable &global : m_module->globals()) { @@ -1886,7 +1884,7 @@ void SpirvLowerGlobal::lowerTaskPayload() { // ===================================================================================================================== // Lowers push constants. -void SpirvLowerGlobal::lowerPushConsts() { +void LowerGlobals::lowerPushConsts() { SmallVector globalsToRemove; for (GlobalVariable &global : m_module->globals()) { @@ -1947,7 +1945,7 @@ void SpirvLowerGlobal::lowerPushConsts() { // ===================================================================================================================== // Lowers uniform constants. -void SpirvLowerGlobal::lowerUniformConstants() { +void LowerGlobals::lowerUniformConstants() { SmallVector globalsToRemove; for (GlobalVariable &global : m_module->globals()) { @@ -1999,8 +1997,8 @@ void SpirvLowerGlobal::lowerUniformConstants() { // @param callInst : "Call" instruction // @param indexOperands : indices of GEP instruction // @param gv : Global Variable instruction -Value *SpirvLowerGlobal::interpolateInputElement(Type *returnTy, unsigned interpLoc, Value *auxInterpValue, - GlobalVariable *gv, ArrayRef indexOperands) { +Value *LowerGlobals::interpolateInputElement(Type *returnTy, unsigned interpLoc, Value *auxInterpValue, + GlobalVariable *gv, ArrayRef indexOperands) { assert((indexOperands.empty() || cast(indexOperands.front())->isZero()) && "Non-zero GEP first index\n"); auto inputTy = gv->getValueType(); @@ -2045,7 +2043,7 @@ Value *SpirvLowerGlobal::interpolateInputElement(Type *returnTy, unsigned interp // ===================================================================================================================== // Fill the XFB info map from the Vkgc::ApiXfbOutData if XFB is specified by API interface -void SpirvLowerGlobal::buildApiXfbMap() { +void 
LowerGlobals::buildApiXfbMap() { auto pipelineBuildInfo = static_cast(m_context->getPipelineBuildInfo()); for (unsigned idx = 0; idx < pipelineBuildInfo->getGlState().apiXfbOutData.numXfbOutInfo; ++idx) { const auto &xfbInfo = pipelineBuildInfo->getGlState().apiXfbOutData.pXfbOutInfos[idx]; @@ -2070,17 +2068,13 @@ void SpirvLowerGlobal::buildApiXfbMap() { // @param xfbOffsetAdjust : Adjustment of transform feedback offset (for array type) // @param locOffset : Relative location offset, passed from aggregate type // @param outputInfo : Extra output info (GS stream ID) -void SpirvLowerGlobal::addCallInstForXfbOutput(const ShaderInOutMetadata &outputMeta, Value *outputValue, - unsigned xfbBufferAdjust, unsigned xfbOffsetAdjust, unsigned locOffset, - lgc::InOutInfo outputInfo) { +void LowerGlobals::addCallInstForXfbOutput(const ShaderInOutMetadata &outputMeta, Value *outputValue, + unsigned xfbBufferAdjust, unsigned xfbOffsetAdjust, unsigned locOffset, + lgc::InOutInfo outputInfo) { assert(m_shaderStage == m_lastVertexProcessingStage); DenseMap *locXfbMapPtr = outputMeta.IsBuiltIn ? &m_builtInXfbMap : &m_genericXfbMap; bool hasXfbMetadata = m_entryPoint->getMetadata(lgc::XfbStateMetadataName); bool hasXfbOut = hasXfbMetadata && (!locXfbMapPtr->empty() || outputMeta.IsXfb); -#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 70 - auto pipelineBuildInfo = static_cast(m_context->getPipelineBuildInfo()); - hasXfbOut &= !pipelineBuildInfo->apiXfbOutData.forceDisableStreamOut; -#endif if (!hasXfbOut) return; @@ -2140,7 +2134,7 @@ void SpirvLowerGlobal::addCallInstForXfbOutput(const ShaderInOutMetadata &output // ===================================================================================================================== // Lowers shader record buffer. 
-void SpirvLowerGlobal::lowerShaderRecordBuffer() { +void LowerGlobals::lowerShaderRecordBuffer() { // Note: Only ray tracing pipeline has shader record buffer if (m_context->getPipelineType() != PipelineType::RayTracing) return; @@ -2170,7 +2164,7 @@ void SpirvLowerGlobal::lowerShaderRecordBuffer() { // // @param input : Input to be handled // @param proxy : Proxy of the input -void SpirvLowerGlobal::handleVolatileInput(GlobalVariable *input, Value *proxy) { +void LowerGlobals::handleVolatileInput(GlobalVariable *input, Value *proxy) { // For now, only check for RayTCurrent (BuiltInRayTmaxKHR, BuiltInHitTNV) in intersection shader. // TODO: Maybe also needed for BuiltInSubgroupLocalInvocationId and related. if (!input->getValueType()->isFloatTy()) @@ -2225,7 +2219,7 @@ void SpirvLowerGlobal::handleVolatileInput(GlobalVariable *input, Value *proxy) // ===================================================================================================================== // Changes function signature for RT shaders. Specifically, add payload / hit attribute / callable data pointers and // metadata to function signature. -void SpirvLowerGlobal::changeRtFunctionSignature() { +void LowerGlobals::changeRtFunctionSignature() { if (!isRayTracingShaderStage(m_shaderStage)) return; diff --git a/llpc/lower/llpcSpirvLowerGlobal.h b/llpc/lower/LowerGlobals.h similarity index 97% rename from llpc/lower/llpcSpirvLowerGlobal.h rename to llpc/lower/LowerGlobals.h index 700f9c870b..86ef63bc06 100644 --- a/llpc/lower/llpcSpirvLowerGlobal.h +++ b/llpc/lower/LowerGlobals.h @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerGlobal.h - * @brief LLPC header file: contains declaration of class Llpc::SpirvLowerGlobal. 
+ * @file LowerGlobals.h + * @brief LLPC header file: contains declaration of class Llpc::LowerGlobals. *********************************************************************************************************************** */ #pragma once @@ -45,9 +45,9 @@ namespace Llpc { // ===================================================================================================================== // Represents the pass of SPIR-V lowering operations for globals (global variables, inputs, and outputs). -class SpirvLowerGlobal : public SpirvLower, public llvm::PassInfoMixin { +class LowerGlobals : public SpirvLower, public llvm::PassInfoMixin { public: - SpirvLowerGlobal(); + LowerGlobals(); llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); void handleCallInst(bool checkEmitCall, bool checkInterpCall); diff --git a/llpc/lower/llpcSpirvLowerInstMetaRemove.cpp b/llpc/lower/LowerInstMetaRemove.cpp similarity index 97% rename from llpc/lower/llpcSpirvLowerInstMetaRemove.cpp rename to llpc/lower/LowerInstMetaRemove.cpp index 428b54011f..2a826d8cad 100644 --- a/llpc/lower/llpcSpirvLowerInstMetaRemove.cpp +++ b/llpc/lower/LowerInstMetaRemove.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerInstMetaRemove.cpp + * @file LowerInstMetaRemove.cpp * @brief LLPC source file: contains implementation of class Llpc::SpirvLowerInstMetaRemove. 
*********************************************************************************************************************** */ -#include "llpcSpirvLowerInstMetaRemove.h" +#include "LowerInstMetaRemove.h" #include "SPIRVInternal.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/Debug.h" diff --git a/llpc/lower/llpcSpirvLowerInstMetaRemove.h b/llpc/lower/LowerInstMetaRemove.h similarity index 98% rename from llpc/lower/llpcSpirvLowerInstMetaRemove.h rename to llpc/lower/LowerInstMetaRemove.h index 8ce50b7950..7a5343abe9 100644 --- a/llpc/lower/llpcSpirvLowerInstMetaRemove.h +++ b/llpc/lower/LowerInstMetaRemove.h @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerInstMetaRemove.h + * @file LowerInstMetaRemove.h * @brief LLPC header file: contains declaration of class Llpc::SpirvLowerInstMetaRemove. *********************************************************************************************************************** */ diff --git a/llpc/lower/llpcSpirvLowerInternalLibraryIntrinsicUtil.cpp b/llpc/lower/LowerInternalLibraryIntrinsic.cpp similarity index 98% rename from llpc/lower/llpcSpirvLowerInternalLibraryIntrinsicUtil.cpp rename to llpc/lower/LowerInternalLibraryIntrinsic.cpp index 65a0bcef1e..2b184a0006 100644 --- a/llpc/lower/llpcSpirvLowerInternalLibraryIntrinsicUtil.cpp +++ b/llpc/lower/LowerInternalLibraryIntrinsic.cpp @@ -1,7 +1,7 @@ /* *********************************************************************************************************************** * - * Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -24,12 +24,12 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerInternalLibraryIntrinsicUtil.cpp + * @file LowerInternalLibraryIntrinsic.cpp * @brief LLPC source file: utilities for lowering common internal library intrinsics. *********************************************************************************************************************** */ -#include "llpcSpirvLowerInternalLibraryIntrinsicUtil.h" +#include "LowerInternalLibraryIntrinsic.h" #include "SPIRVInternal.h" #include "llpcContext.h" #include "lgc/Builder.h" diff --git a/llpc/lower/llpcSpirvLowerInternalLibraryIntrinsicUtil.h b/llpc/lower/LowerInternalLibraryIntrinsic.h similarity index 94% rename from llpc/lower/llpcSpirvLowerInternalLibraryIntrinsicUtil.h rename to llpc/lower/LowerInternalLibraryIntrinsic.h index 63ab5549ce..a2233b2e8b 100644 --- a/llpc/lower/llpcSpirvLowerInternalLibraryIntrinsicUtil.h +++ b/llpc/lower/LowerInternalLibraryIntrinsic.h @@ -1,7 +1,7 @@ /* *********************************************************************************************************************** * - * Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerInternalLibraryIntrinsicUtil.cpp + * @file LowerInternalLibraryIntrinsic.h * @brief LLPC header file: utilities for lowering common internal library intrinsics. *********************************************************************************************************************** */ diff --git a/llpc/lower/llpcSpirvLowerMath.cpp b/llpc/lower/LowerMath.cpp similarity index 99% rename from llpc/lower/llpcSpirvLowerMath.cpp rename to llpc/lower/LowerMath.cpp index 44c07f5413..f5280171a4 100644 --- a/llpc/lower/llpcSpirvLowerMath.cpp +++ b/llpc/lower/LowerMath.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerMath.cpp + * @file LowerMath.cpp * @brief LLPC source file: implementations of Llpc::SpirvLowerMathConstFolding and Llpc::SpirvLowerMathFloatOp. 
*********************************************************************************************************************** */ -#include "llpcSpirvLowerMath.h" +#include "LowerMath.h" #include "SPIRVInternal.h" #include "hex_float.h" #include "llpcContext.h" diff --git a/llpc/lower/llpcSpirvLowerMath.h b/llpc/lower/LowerMath.h similarity index 97% rename from llpc/lower/llpcSpirvLowerMath.h rename to llpc/lower/LowerMath.h index 5899096e5a..d4c30d8f4c 100644 --- a/llpc/lower/llpcSpirvLowerMath.h +++ b/llpc/lower/LowerMath.h @@ -1,7 +1,7 @@ /* *********************************************************************************************************************** * - * Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerMath.h + * @file LowerMath.h * @brief LLPC header file: contains declarations of math lowering classes *********************************************************************************************************************** */ diff --git a/llpc/lower/llpcSpirvLowerMemoryOp.cpp b/llpc/lower/LowerMemoryOp.cpp similarity index 92% rename from llpc/lower/llpcSpirvLowerMemoryOp.cpp rename to llpc/lower/LowerMemoryOp.cpp index 35fe5b0e41..7b98ea7f02 100644 --- a/llpc/lower/llpcSpirvLowerMemoryOp.cpp +++ b/llpc/lower/LowerMemoryOp.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** 
*********************************************************************************************************************** - * @file llpcSpirvLowerMemoryOp.cpp - * @brief LLPC source file: contains implementation of class Llpc::SpirvLowerMemoryOp. + * @file LowerMemoryOp.cpp + * @brief LLPC source file: contains implementation of class Llpc::LowerMemoryOp. *********************************************************************************************************************** */ -#include "llpcSpirvLowerMemoryOp.h" +#include "LowerMemoryOp.h" #include "SPIRVInternal.h" #include "llpcContext.h" #include "llvm/IR/Instructions.h" @@ -36,7 +36,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#define DEBUG_TYPE "llpc-spirv-lower-memory-op" +#define DEBUG_TYPE "lower-memory-op" using namespace llvm; using namespace SPIRV; @@ -49,8 +49,8 @@ namespace Llpc { // // @param [in/out] module : LLVM module to be run on // @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerMemoryOp::run(Module &module, ModuleAnalysisManager &analysisManager) { - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Memory-Op\n"); +PreservedAnalyses LowerMemoryOp::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the pass Lower-Memory-Op\n"); SpirvLower::init(&module); @@ -77,7 +77,7 @@ PreservedAnalyses SpirvLowerMemoryOp::run(Module &module, ModuleAnalysisManager } m_removeInsts.clear(); - LLVM_DEBUG(dbgs() << "After the pass Spirv-Lower-Memory-Op " << module); + LLVM_DEBUG(dbgs() << "After the pass Lower-Memory-Op " << module); return PreservedAnalyses::none(); } @@ -86,7 +86,7 @@ PreservedAnalyses SpirvLowerMemoryOp::run(Module &module, ModuleAnalysisManager // Visits "extractelement" instruction. 
// // @param extractElementInst : "ExtractElement" instruction -void SpirvLowerMemoryOp::visitExtractElementInst(ExtractElementInst &extractElementInst) { +void LowerMemoryOp::visitExtractElementInst(ExtractElementInst &extractElementInst) { auto src = extractElementInst.getOperand(0); if (src->getType()->isVectorTy() && isa(src) && src->hasOneUse()) { // NOTE: Optimize loading vector component for local variable and memory block @@ -118,7 +118,7 @@ void SpirvLowerMemoryOp::visitExtractElementInst(ExtractElementInst &extractElem // Visits "getelementptr" instruction. // // @param getElemPtrInst : "GetElementPtr" instruction -void SpirvLowerMemoryOp::visitGetElementPtrInst(GetElementPtrInst &getElemPtrInst) { +void LowerMemoryOp::visitGetElementPtrInst(GetElementPtrInst &getElemPtrInst) { unsigned operandIndex = InvalidValue; unsigned dynIndexBound = 0; @@ -166,8 +166,8 @@ void SpirvLowerMemoryOp::visitGetElementPtrInst(GetElementPtrInst &getElemPtrIns // @param getElemPtr : "GetElementPtr" instruction // @param [out] operandIndexOut : Index of the operand that represents a dynamic index // @param [out] dynIndexBound : Upper bound of dynamic index -bool SpirvLowerMemoryOp::needExpandDynamicIndex(GetElementPtrInst *getElemPtr, unsigned *operandIndexOut, - unsigned *dynIndexBound) const { +bool LowerMemoryOp::needExpandDynamicIndex(GetElementPtrInst *getElemPtr, unsigned *operandIndexOut, + unsigned *dynIndexBound) const { static const unsigned MaxDynIndexBound = 8; std::vector idxs; @@ -244,8 +244,7 @@ bool SpirvLowerMemoryOp::needExpandDynamicIndex(GetElementPtrInst *getElemPtr, u // @param loadInst : "Load" instruction // @param getElemPtrs : A group of "getelementptr" with constant indices // @param dynIndex : Dynamic index -void SpirvLowerMemoryOp::expandLoadInst(LoadInst *loadInst, ArrayRef getElemPtrs, - Value *dynIndex) { +void LowerMemoryOp::expandLoadInst(LoadInst *loadInst, ArrayRef getElemPtrs, Value *dynIndex) { // Expand is something like this: // // 
firstValue = load getElemPtrs[0] @@ -286,8 +285,8 @@ void SpirvLowerMemoryOp::expandLoadInst(LoadInst *loadInst, ArrayRef getElemPtrs, - Value *dynIndex) { +void LowerMemoryOp::recordStoreExpandInfo(StoreInst *storeInst, ArrayRef getElemPtrs, + Value *dynIndex) { StoreExpandInfo expandInfo = {}; expandInfo.storeInst = storeInst; expandInfo.dynIndex = dynIndex; @@ -304,8 +303,7 @@ void SpirvLowerMemoryOp::recordStoreExpandInfo(StoreInst *storeInst, ArrayRef getElemPtrs, - Value *dynIndex) { +void LowerMemoryOp::expandStoreInst(StoreInst *storeInst, ArrayRef getElemPtrs, Value *dynIndex) { const bool robustBufferAccess = m_context->getPipelineContext()->getPipelineOptions()->robustBufferAccess; const unsigned getElemPtrCount = getElemPtrs.size(); bool isType64 = (dynIndex->getType()->getPrimitiveSizeInBits() == 64); diff --git a/llpc/lower/llpcSpirvLowerMemoryOp.h b/llpc/lower/LowerMemoryOp.h similarity index 93% rename from llpc/lower/llpcSpirvLowerMemoryOp.h rename to llpc/lower/LowerMemoryOp.h index 7e0dd5721c..842f4c1d8b 100644 --- a/llpc/lower/llpcSpirvLowerMemoryOp.h +++ b/llpc/lower/LowerMemoryOp.h @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerMemoryOp.h - * @brief LLPC header file: contains declaration of class Llpc::SpirvLowerMemoryOp. + * @file LowerMemoryOp.h + * @brief LLPC header file: contains declaration of class Llpc::LowerMemoryOp. *********************************************************************************************************************** */ #pragma once @@ -52,9 +52,9 @@ struct StoreExpandInfo { // ===================================================================================================================== // Represents the pass of SPIR-V lowering memory operations. 
-class SpirvLowerMemoryOp : public SpirvLower, - public llvm::InstVisitor, - public llvm::PassInfoMixin { +class LowerMemoryOp : public SpirvLower, + public llvm::InstVisitor, + public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); diff --git a/llpc/lower/llpcSpirvLowerRayTracing.cpp b/llpc/lower/LowerRayTracing.cpp similarity index 99% rename from llpc/lower/llpcSpirvLowerRayTracing.cpp rename to llpc/lower/LowerRayTracing.cpp index d40772b741..a9d0800e36 100644 --- a/llpc/lower/llpcSpirvLowerRayTracing.cpp +++ b/llpc/lower/LowerRayTracing.cpp @@ -24,18 +24,18 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerRayTracing.cpp + * @file LowerRayTracing.cpp * @brief LLPC source file: contains implementation of class Llpc::SpirvLowerRayTracing. 
*********************************************************************************************************************** */ -#include "llpcSpirvLowerRayTracing.h" +#include "LowerRayTracing.h" #include "SPIRVInternal.h" -#include "compilerutils/CompilerUtils.h" #include "gpurt-compiler.h" #include "llpcContext.h" #include "llpcRayTracingContext.h" #include "llpcSpirvLowerUtil.h" +#include "compilerutils/CompilerUtils.h" #include "llvmraytracing/ContinuationsUtil.h" #include "llvmraytracing/GpurtContext.h" #include "lgc/Builder.h" diff --git a/llpc/lower/llpcSpirvLowerRayTracing.h b/llpc/lower/LowerRayTracing.h similarity index 99% rename from llpc/lower/llpcSpirvLowerRayTracing.h rename to llpc/lower/LowerRayTracing.h index 6bc3539098..8eeee82d1b 100644 --- a/llpc/lower/llpcSpirvLowerRayTracing.h +++ b/llpc/lower/LowerRayTracing.h @@ -24,15 +24,15 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerRayTracing.h + * @file LowerRayTracing.h * @brief LLPC header file: contains declaration of Llpc::SpirvLowerRayTracing *********************************************************************************************************************** */ #pragma once #include "SPIRVInternal.h" -#include "compilerutils/CompilerUtils.h" #include "llpcSpirvLower.h" +#include "compilerutils/CompilerUtils.h" #include "llvm/ADT/SmallSet.h" #include "llvm/IR/PassManager.h" #include diff --git a/llpc/lower/llpcSpirvLowerTerminator.cpp b/llpc/lower/LowerTerminator.cpp similarity index 98% rename from llpc/lower/llpcSpirvLowerTerminator.cpp rename to llpc/lower/LowerTerminator.cpp index 63ba34bf65..6544d37896 100644 --- a/llpc/lower/llpcSpirvLowerTerminator.cpp +++ b/llpc/lower/LowerTerminator.cpp @@ -24,13 +24,13 @@ 
**********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerTerminator.cpp + * @file LowerTerminator.cpp * @brief LLPC source file: contains implementation of class Llpc::SpirvLowerTerminator. * @details This pass removes trailing instructions after known terminators. * These dead instructions can occur when functions calling terminators, such as OpKill, are inlined. *********************************************************************************************************************** */ -#include "llpcSpirvLowerTerminator.h" +#include "LowerTerminator.h" #include "SPIRVInternal.h" #include "llpcContext.h" #include "llpcDebug.h" diff --git a/llpc/lower/llpcSpirvLowerTerminator.h b/llpc/lower/LowerTerminator.h similarity index 95% rename from llpc/lower/llpcSpirvLowerTerminator.h rename to llpc/lower/LowerTerminator.h index f6a107e732..bb472d9768 100644 --- a/llpc/lower/llpcSpirvLowerTerminator.h +++ b/llpc/lower/LowerTerminator.h @@ -1,7 +1,7 @@ /* *********************************************************************************************************************** * - * Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerTerminator.h + * @file LowerTerminator.h * @brief LLPC header file: contains declaration of Llpc::SpirvLowerTerminator *********************************************************************************************************************** */ diff --git a/llpc/lower/llpcSpirvLowerTranslator.cpp b/llpc/lower/LowerTranslator.cpp similarity index 98% rename from llpc/lower/llpcSpirvLowerTranslator.cpp rename to llpc/lower/LowerTranslator.cpp index bcb8dc25d2..0c86246823 100644 --- a/llpc/lower/llpcSpirvLowerTranslator.cpp +++ b/llpc/lower/LowerTranslator.cpp @@ -24,11 +24,11 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerTranslator.cpp + * @file LowerTranslator.cpp * @brief LLPC source file: contains implementation of Llpc::SpirvLowerTranslator *********************************************************************************************************************** */ -#include "llpcSpirvLowerTranslator.h" +#include "LowerTranslator.h" #include "LLVMSPIRVLib.h" #include "llpcCompiler.h" #include "llpcContext.h" diff --git a/llpc/lower/llpcSpirvLowerTranslator.h b/llpc/lower/LowerTranslator.h similarity index 98% rename from llpc/lower/llpcSpirvLowerTranslator.h rename to llpc/lower/LowerTranslator.h index 78389ac002..612e150376 100644 --- a/llpc/lower/llpcSpirvLowerTranslator.h +++ b/llpc/lower/LowerTranslator.h @@ 
-24,7 +24,7 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvLowerTranslator.h + * @file LowerTranslator.h * @brief LLPC header file: contains declaration of Llpc::SpirvLowerTranslator *********************************************************************************************************************** */ diff --git a/llpc/lower/PassRegistry.inc b/llpc/lower/PassRegistry.inc index 940cd516e7..73cf809b15 100644 --- a/llpc/lower/PassRegistry.inc +++ b/llpc/lower/PassRegistry.inc @@ -42,17 +42,17 @@ LLPC_MODULE_PASS("instcombine", InstCombinePass) LLPC_MODULE_PASS("simplifycfg", SimplifyCFGPass) LLPC_MODULE_PASS("early-cse", EarlyCSEPass) LLPC_MODULE_PASS("llpc-spirv-lower-gl-compatibility", LowerGLCompatibility) -LLPC_MODULE_PASS("llpc-spirv-lower-access-chain", SpirvLowerAccessChain) -LLPC_MODULE_PASS("llpc-spirv-lower-cfg-merges", SpirvLowerCfgMerges) +LLPC_MODULE_PASS("lower-access-chain", LowerAccessChain) +LLPC_MODULE_PASS("lower-cfg-merges", LowerCfgMerges) LLPC_MODULE_PASS("llpc-spirv-lower-const-immediate-store", SpirvLowerConstImmediateStore) LLPC_MODULE_PASS("llpc-spirv-lower-cooperative-matrix", SpirvLowerCooperativeMatrix) LLPC_MODULE_PASS("llpc-spirv-lower-inst-meta-remove", SpirvLowerInstMetaRemove) LLPC_MODULE_PASS("llpc-spirv-lower-terminator", SpirvLowerTerminator) -LLPC_MODULE_PASS("llpc-spirv-lower-global", SpirvLowerGlobal) +LLPC_MODULE_PASS("lower-globals", LowerGlobals) LLPC_MODULE_PASS("llpc-spirv-lower-math-const-folding", SpirvLowerMathConstFolding) LLPC_MODULE_PASS("llpc-spirv-lower-math-precision", SpirvLowerMathPrecision) LLPC_MODULE_PASS("llpc-spirv-lower-math-float-op", SpirvLowerMathFloatOp) -LLPC_MODULE_PASS("llpc-spirv-lower-memory-op", SpirvLowerMemoryOp) +LLPC_MODULE_PASS("lower-memory-op", LowerMemoryOp) 
LLPC_MODULE_PASS("llpc-spirv-lower-ray-tracing", SpirvLowerRayTracing) LLPC_MODULE_PASS("lower-post-inline", LowerPostInline) diff --git a/llpc/lower/PrepareContinuations.cpp b/llpc/lower/PrepareContinuations.cpp index 0d54f87575..0599760a2b 100644 --- a/llpc/lower/PrepareContinuations.cpp +++ b/llpc/lower/PrepareContinuations.cpp @@ -29,9 +29,9 @@ *********************************************************************************************************************** */ #include "PrepareContinuations.h" -#include "compilerutils/CompilerUtils.h" #include "llpcContext.h" #include "llpcRayTracingContext.h" +#include "compilerutils/CompilerUtils.h" #include "llvmraytracing/ContinuationsUtil.h" #include "llvmraytracing/GpurtContext.h" #include "lgc/Builder.h" diff --git a/llpc/lower/ProcessGfxRuntimeLibrary.cpp b/llpc/lower/ProcessGfxRuntimeLibrary.cpp index 64add0fc3e..1fad88f7e4 100644 --- a/llpc/lower/ProcessGfxRuntimeLibrary.cpp +++ b/llpc/lower/ProcessGfxRuntimeLibrary.cpp @@ -29,11 +29,12 @@ *********************************************************************************************************************** */ #include "ProcessGfxRuntimeLibrary.h" +#include "LowerInternalLibraryIntrinsic.h" +#include "llpcSpirvLowerUtil.h" #include "compilerutils/ArgPromotion.h" #include "compilerutils/TypesMetadata.h" -#include "llpcSpirvLowerInternalLibraryIntrinsicUtil.h" -#include "llpcSpirvLowerUtil.h" #include "lgc/Builder.h" +#include "lgc/LgcDialect.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/IR/Module.h" @@ -65,9 +66,11 @@ PreservedAnalyses ProcessGfxRuntimeLibrary::run(Module &module, ModuleAnalysisMa // Initialize library function pointer table ProcessGfxRuntimeLibrary::LibraryFunctionTable::LibraryFunctionTable() { m_libFuncPtrs["AmdAdvancedBlendTexelLoad"] = &ProcessGfxRuntimeLibrary::createTexelLoad; - m_libFuncPtrs["AmdAdvancedBlendTexelLoadFmask"] = &ProcessGfxRuntimeLibrary::createTexelLoadFmask; + m_libFuncPtrs["AmdAdvancedBlendTexelLoadMsaa"] = 
&ProcessGfxRuntimeLibrary::createTexelLoadMsaa; m_libFuncPtrs["AmdAdvancedBlendCoherentTexelLoad"] = &ProcessGfxRuntimeLibrary::createCoherentTexelLoad; m_libFuncPtrs["AmdAdvancedBlendCoherentTexelStore"] = &ProcessGfxRuntimeLibrary::createCoherentTexelStore; + m_libFuncPtrs["AmdAdvancedBlendCoherentTexelLoadMsaa"] = &ProcessGfxRuntimeLibrary::createCoherentTexelLoadMsaa; + m_libFuncPtrs["AmdAdvancedBlendCoherentTexelStoreMsaa"] = &ProcessGfxRuntimeLibrary::createCoherentTexelStoreMsaa; } // ===================================================================================================================== @@ -113,28 +116,21 @@ void ProcessGfxRuntimeLibrary::processLibraryFunction(Function *&func) { // ===================================================================================================================== // Create texel load +// +// @param func : The function to process void ProcessGfxRuntimeLibrary::createTexelLoad(Function *func) { - // Arguments: imageDesc, icoord, lod - constexpr unsigned argCount = 3; - Type *int2Ty = FixedVectorType::get(m_builder->getInt32Ty(), 2); - Type *argTypes[] = {m_builder->getInt64Ty(), int2Ty, m_builder->getInt32Ty()}; - std::array loadArgs; - for (unsigned i = 0; i < argCount; ++i) - loadArgs[i] = m_builder->CreateLoad(argTypes[i], func->getArg(i)); - unsigned imageFlag = Builder::ImageFlagInvariant | Builder::ImageFlagNotAliased; - loadArgs[0] = m_builder->CreateIntToPtr(loadArgs[0], PointerType::get(m_builder->getContext(), ADDR_SPACE_CONST)); - auto imageLoad = m_builder->CreateImageLoad(func->getReturnType(), Builder::Dim2D, imageFlag, loadArgs[0], - loadArgs[1], loadArgs[2]); - m_builder->CreateRet(imageLoad); + loadTexel(func, false, false); } // ===================================================================================================================== // Create texel load with fmask -void ProcessGfxRuntimeLibrary::createTexelLoadFmask(Function *func) { - // Argument: imageDescMs, fmaskDesc, icoord, 
lod +// +// @param func : The function to process +void ProcessGfxRuntimeLibrary::createTexelLoadMsaa(Function *func) { + // Argument: imageDescMs, fmaskDesc, icoord, sampleNum constexpr unsigned argCount = 4; - Type *int2Ty = FixedVectorType::get(m_builder->getInt32Ty(), 2); - Type *argTypes[] = {m_builder->getInt64Ty(), m_builder->getInt64Ty(), int2Ty, m_builder->getInt32Ty()}; + Type *coordTy = FixedVectorType::get(m_builder->getInt32Ty(), 2); + Type *argTypes[] = {m_builder->getInt64Ty(), m_builder->getInt64Ty(), coordTy, m_builder->getInt32Ty()}; std::array loadArgs; for (unsigned i = 0; i < argCount; ++i) loadArgs[i] = m_builder->CreateLoad(argTypes[i], func->getArg(i)); @@ -148,32 +144,113 @@ void ProcessGfxRuntimeLibrary::createTexelLoadFmask(Function *func) { // ===================================================================================================================== // Create coherent texel Load +// +// @param func : The function to process void ProcessGfxRuntimeLibrary::createCoherentTexelLoad(Function *func) { - // Argument: inColor, icoord, sampleId + m_builder->create(); + loadTexel(func, false, true); +} + +// ===================================================================================================================== +// Create coherent texel store +// +// @param func : The function to process +void ProcessGfxRuntimeLibrary::createCoherentTexelStore(Function *func) { + storeTexel(func, false, true); + m_builder->create(); + m_builder->CreateRetVoid(); +} + +// ===================================================================================================================== +// Create coherent texel Load with multi-sampling +// +// @param func : The function to process +void ProcessGfxRuntimeLibrary::createCoherentTexelLoadMsaa(Function *func) { + m_builder->create(); + loadTexel(func, true, true); +} + +// ===================================================================================================================== +// 
Create coherent texel store with multi-sampling +// +// @param func : The function to process +void ProcessGfxRuntimeLibrary::createCoherentTexelStoreMsaa(Function *func) { + storeTexel(func, true, true); + m_builder->create(); + m_builder->CreateRetVoid(); +} + +// ===================================================================================================================== +// Perform texel load with or without ROV supported +// +// @param func : The function to process +// @param isMsaa : Whether it is multi-sampling +// @param enableRov : Whether ROV is enabled +void ProcessGfxRuntimeLibrary::loadTexel(Function *func, bool isMsaa, bool enableRov) { + // Argument: desc, icoord, sampleId constexpr unsigned argCount = 3; - Type *Float4Ty = FixedVectorType::get(m_builder->getFloatTy(), 4); + unsigned coordCount = enableRov ? 3 : 2; + Type *coordTy = FixedVectorType::get(m_builder->getInt32Ty(), coordCount); + Value *coord = PoisonValue::get(coordTy); + Type *int2Ty = FixedVectorType::get(m_builder->getInt32Ty(), 2); - Type *argTypes[] = {Float4Ty, int2Ty, m_builder->getInt32Ty()}; + Type *argTypes[] = {m_builder->getInt64Ty(), int2Ty, m_builder->getInt32Ty()}; std::array loadArgs; for (unsigned i = 0; i < argCount; ++i) loadArgs[i] = m_builder->CreateLoad(argTypes[i], func->getArg(i)); - // TODO: Implement load texel based on ROV - m_builder->CreateRet(loadArgs[0]); + + unsigned dim = isMsaa ? 
 Builder::Dim2DMsaa : Builder::Dim2D; + unsigned imageFlag = Builder::ImageFlagInvariant | Builder::ImageFlagNotAliased | Builder::ImageFlagCoherent; + loadArgs[0] = m_builder->CreateIntToPtr(loadArgs[0], PointerType::get(m_builder->getContext(), ADDR_SPACE_CONST)); + if (enableRov) { + // (icoord.x, icoord.y, icoord.z) = (loadArgs[1].x, loadArgs[1].y, sampleId) + coord = m_builder->CreateInsertElement(coord, m_builder->CreateExtractElement(loadArgs[1], m_builder->getInt32(0)), + static_cast(0)); + coord = + m_builder->CreateInsertElement(coord, m_builder->CreateExtractElement(loadArgs[1], m_builder->getInt32(1)), 1); + coord = m_builder->CreateInsertElement(coord, loadArgs[2], 2); + } else { + coord = loadArgs[1]; + } + + auto imageLoad = m_builder->CreateImageLoad(func->getReturnType(), dim, imageFlag, loadArgs[0], coord, nullptr); + m_builder->CreateRet(imageLoad); } // ===================================================================================================================== -// Create coherent texel store -void ProcessGfxRuntimeLibrary::createCoherentTexelStore(Function *func) { - // Argument: inColor, icoord, sampleId - constexpr unsigned argCount = 3; - Type *Float4Ty = FixedVectorType::get(m_builder->getFloatTy(), 4); +// Perform texel store with or without ROV supported +// +// @param func : The function to process +// @param isMsaa : Whether it is multi-sampling +// @param enableRov : Whether ROV is enabled +void ProcessGfxRuntimeLibrary::storeTexel(Function *func, bool isMsaa, bool enableRov) { + // Argument: texel, desc, icoord, sampleId + constexpr unsigned argCount = 4; + unsigned coordCount = enableRov ? 
 3 : 2; + Type *coordTy = FixedVectorType::get(m_builder->getInt32Ty(), coordCount); + Value *coord = PoisonValue::get(coordTy); + Type *texelTy = FixedVectorType::get(m_builder->getFloatTy(), 4); + Type *int2Ty = FixedVectorType::get(m_builder->getInt32Ty(), 2); - Type *argTypes[] = {Float4Ty, int2Ty, m_builder->getInt32Ty()}; - std::array storeArgs; + Type *argTypes[] = {texelTy, m_builder->getInt64Ty(), int2Ty, m_builder->getInt32Ty()}; + std::array loadArgs; for (unsigned i = 0; i < argCount; ++i) - storeArgs[i] = m_builder->CreateLoad(argTypes[i], func->getArg(i)); - // TODO: Implement store texel based on ROV - m_builder->CreateRetVoid(); + loadArgs[i] = m_builder->CreateLoad(argTypes[i], func->getArg(i)); + + unsigned dim = isMsaa ? Builder::Dim2DMsaa : Builder::Dim2D; + unsigned imageFlag = Builder::ImageFlagInvariant | Builder::ImageFlagNotAliased | Builder::ImageFlagCoherent; + loadArgs[1] = m_builder->CreateIntToPtr(loadArgs[1], PointerType::get(m_builder->getContext(), ADDR_SPACE_CONST)); + if (enableRov) { + // (icoord.x, icoord.y, icoord.z) = (loadArgs[2].x, loadArgs[2].y, sampleId) + coord = m_builder->CreateInsertElement(coord, m_builder->CreateExtractElement(loadArgs[2], m_builder->getInt32(0)), + static_cast(0)); + coord = + m_builder->CreateInsertElement(coord, m_builder->CreateExtractElement(loadArgs[2], m_builder->getInt32(1)), 1); + coord = m_builder->CreateInsertElement(coord, loadArgs[3], 2); + } else { + coord = loadArgs[2]; + } + m_builder->CreateImageStore(loadArgs[0], dim, imageFlag, loadArgs[1], coord, nullptr); } } // namespace Llpc diff --git a/llpc/lower/ProcessGfxRuntimeLibrary.h b/llpc/lower/ProcessGfxRuntimeLibrary.h index 19a48861c6..91322eb570 100644 --- a/llpc/lower/ProcessGfxRuntimeLibrary.h +++ b/llpc/lower/ProcessGfxRuntimeLibrary.h @@ -51,8 +51,13 @@ class ProcessGfxRuntimeLibrary : public SpirvLower, public llvm::PassInfoMixin

= 50 + m_libFuncPtrs["AmdTraceRayInitStaticId"] = &ProcessGpuRtLibrary::createInitStaticId; +#endif + m_libFuncPtrs["AmdTraceRayGetKnownSetRayFlags"] = &ProcessGpuRtLibrary::createGetKnownSetRayFlags; + m_libFuncPtrs["AmdTraceRayGetKnownUnsetRayFlags"] = &ProcessGpuRtLibrary::createGetKnownUnsetRayFlags; + m_libFuncPtrs["_AmdContStackAlloc"] = &ProcessGpuRtLibrary::createContStackAlloc; + m_libFuncPtrs["_AmdContStackFree"] = &ProcessGpuRtLibrary::createContStackFree; + m_libFuncPtrs["_AmdContStackGetPtr"] = &ProcessGpuRtLibrary::createContStackGetPtr; + m_libFuncPtrs["_AmdContStackSetPtr"] = &ProcessGpuRtLibrary::createContStackSetPtr; + m_libFuncPtrs["_AmdContinuationStackIsGlobal"] = &ProcessGpuRtLibrary::createContinuationStackIsGlobal; + m_libFuncPtrs["_AmdGetRtip"] = &ProcessGpuRtLibrary::createGetRtip; + m_libFuncPtrs["_AmdIsLlpc"] = &ProcessGpuRtLibrary::createIsLlpc; } // ===================================================================================================================== // Clear the block before patching the function // // @param func : The function to process -void SpirvProcessGpuRtLibrary::processLibraryFunction(Function *&func) { +void ProcessGpuRtLibrary::processLibraryFunction(Function *&func) { auto funcName = func->getName(); // Special handling for _AmdContStackStore* and _AmdContStackLoad* to accept arbitrary type @@ -277,6 +278,9 @@ void SpirvProcessGpuRtLibrary::processLibraryFunction(Function *&func) { else ContHelper::handleValueSetI32(*newFunc, *m_builder); return; + } else if (funcName.starts_with("_AmdComplete")) { + ContHelper::handleComplete(*func); + return; } // Create implementation for intrinsic functions. 
@@ -360,7 +364,7 @@ void SpirvProcessGpuRtLibrary::processLibraryFunction(Function *&func) { // Fill in function to get stack size // // @param func : The function to process -void SpirvProcessGpuRtLibrary::createGetStackSize(Function *func) { +void ProcessGpuRtLibrary::createGetStackSize(Function *func) { m_builder->CreateRet(m_builder->create()); } @@ -368,7 +372,7 @@ void SpirvProcessGpuRtLibrary::createGetStackSize(Function *func) { // Fill in function to get stack base // // @param func : The function to process -void SpirvProcessGpuRtLibrary::createGetStackBase(Function *func) { +void ProcessGpuRtLibrary::createGetStackBase(Function *func) { m_builder->CreateRet(m_builder->create()); } @@ -376,7 +380,7 @@ void SpirvProcessGpuRtLibrary::createGetStackBase(Function *func) { // Fill in function to write LDS stack // // @param func : The function to process -void SpirvProcessGpuRtLibrary::createLdsWrite(Function *func) { +void ProcessGpuRtLibrary::createLdsWrite(Function *func) { auto argIt = func->arg_begin(); auto int32ty = m_builder->getInt32Ty(); Value *stackOffset = m_builder->CreateLoad(int32ty, argIt++); @@ -388,7 +392,7 @@ void SpirvProcessGpuRtLibrary::createLdsWrite(Function *func) { // Fill in function to read LDS stack // // @param func : The function to process -void SpirvProcessGpuRtLibrary::createLdsRead(Function *func) { +void ProcessGpuRtLibrary::createLdsRead(Function *func) { Value *stackIndex = func->arg_begin(); stackIndex = m_builder->CreateLoad(m_builder->getInt32Ty(), stackIndex); m_builder->CreateRet(m_builder->create(stackIndex, false)); @@ -398,7 +402,7 @@ void SpirvProcessGpuRtLibrary::createLdsRead(Function *func) { // Fill in function to get stack stride // // @param func : The function to process -void SpirvProcessGpuRtLibrary::createGetStackStride(Function *func) { +void ProcessGpuRtLibrary::createGetStackStride(Function *func) { m_builder->CreateRet(m_builder->create()); } @@ -406,12 +410,12 @@ void 
SpirvProcessGpuRtLibrary::createGetStackStride(Function *func) { // Fill in function to init stack LDS // // @param func : The function to process -void SpirvProcessGpuRtLibrary::createLdsStackInit(Function *func) { +void ProcessGpuRtLibrary::createLdsStackInit(Function *func) { m_builder->CreateRet(m_builder->create(false)); } // ===================================================================================================================== -void SpirvProcessGpuRtLibrary::createFloatOpWithRoundMode(llvm::Function *func) { +void ProcessGpuRtLibrary::createFloatOpWithRoundMode(llvm::Function *func) { auto argIt = func->arg_begin(); auto retType = cast(func->getReturnType()); auto int32Ty = m_builder->getInt32Ty(); @@ -426,7 +430,7 @@ void SpirvProcessGpuRtLibrary::createFloatOpWithRoundMode(llvm::Function *func) // Fill in function to store stack LDS // // @param func : The function to process -void SpirvProcessGpuRtLibrary::createLdsStackStore(Function *func) { +void ProcessGpuRtLibrary::createLdsStackStore(Function *func) { auto argIt = func->arg_begin(); Value *stackAddr = argIt++; Value *stackAddrPos = m_builder->CreateLoad(m_builder->getInt32Ty(), stackAddr); @@ -443,7 +447,7 @@ void SpirvProcessGpuRtLibrary::createLdsStackStore(Function *func) { // Fill in function to get box sort heuristic mode // // @param func : The function to process -void SpirvProcessGpuRtLibrary::createGetBoxSortHeuristicMode(Function *func) { +void ProcessGpuRtLibrary::createGetBoxSortHeuristicMode(Function *func) { m_builder->CreateRet(m_builder->create()); } @@ -451,7 +455,7 @@ void SpirvProcessGpuRtLibrary::createGetBoxSortHeuristicMode(Function *func) { // Fill in function to get static flags // // @param func : The function to process -void SpirvProcessGpuRtLibrary::createGetStaticFlags(Function *func) { +void ProcessGpuRtLibrary::createGetStaticFlags(Function *func) { m_builder->CreateRet(m_builder->create()); } @@ -459,7 +463,7 @@ void 
SpirvProcessGpuRtLibrary::createGetStaticFlags(Function *func) { // Fill in function to get triangle compression mode // // @param func : The function to process -void SpirvProcessGpuRtLibrary::createGetTriangleCompressionMode(Function *func) { +void ProcessGpuRtLibrary::createGetTriangleCompressionMode(Function *func) { m_builder->CreateRet(m_builder->create()); } @@ -467,7 +471,7 @@ void SpirvProcessGpuRtLibrary::createGetTriangleCompressionMode(Function *func) // Fill in function to global load 1 dword at given address // // @param func : The function to process -void SpirvProcessGpuRtLibrary::createLoadDwordAtAddr(Function *func) { +void ProcessGpuRtLibrary::createLoadDwordAtAddr(Function *func) { createLoadDwordAtAddrWithType(func, m_builder->getInt32Ty(), SPIRAS_Global); } @@ -475,7 +479,7 @@ void SpirvProcessGpuRtLibrary::createLoadDwordAtAddr(Function *func) { // Fill in function to global load 2 dwords at given address // // @param func : The function to process -void SpirvProcessGpuRtLibrary::createLoadDwordAtAddrx2(Function *func) { +void ProcessGpuRtLibrary::createLoadDwordAtAddrx2(Function *func) { auto int32x2Ty = FixedVectorType::get(m_builder->getInt32Ty(), 2); createLoadDwordAtAddrWithType(func, int32x2Ty, SPIRAS_Global); } @@ -484,7 +488,7 @@ void SpirvProcessGpuRtLibrary::createLoadDwordAtAddrx2(Function *func) { // Fill in function to global load 4 dwords at given address // // @param func : The function to process -void SpirvProcessGpuRtLibrary::createLoadDwordAtAddrx4(Function *func) { +void ProcessGpuRtLibrary::createLoadDwordAtAddrx4(Function *func) { auto int32x4Ty = FixedVectorType::get(m_builder->getInt32Ty(), 4); createLoadDwordAtAddrWithType(func, int32x4Ty, SPIRAS_Global); } @@ -493,7 +497,7 @@ void SpirvProcessGpuRtLibrary::createLoadDwordAtAddrx4(Function *func) { // Fill in function to constant load 1 dword at given address // // @param func : The function to process -void 
SpirvProcessGpuRtLibrary::createConstantLoadDwordAtAddr(Function *func) { +void ProcessGpuRtLibrary::createConstantLoadDwordAtAddr(Function *func) { createLoadDwordAtAddrWithType(func, m_builder->getInt32Ty(), SPIRAS_Constant); } @@ -501,7 +505,7 @@ void SpirvProcessGpuRtLibrary::createConstantLoadDwordAtAddr(Function *func) { // Fill in function to constant load 2 dwords at given address // // @param func : The function to process -void SpirvProcessGpuRtLibrary::createConstantLoadDwordAtAddrx2(Function *func) { +void ProcessGpuRtLibrary::createConstantLoadDwordAtAddrx2(Function *func) { auto int32x2Ty = FixedVectorType::get(m_builder->getInt32Ty(), 2); createLoadDwordAtAddrWithType(func, int32x2Ty, SPIRAS_Constant); } @@ -510,7 +514,7 @@ void SpirvProcessGpuRtLibrary::createConstantLoadDwordAtAddrx2(Function *func) { // Fill in function to constant load 4 dwords at given address // // @param func : The function to process -void SpirvProcessGpuRtLibrary::createConstantLoadDwordAtAddrx4(Function *func) { +void ProcessGpuRtLibrary::createConstantLoadDwordAtAddrx4(Function *func) { auto int32x4Ty = FixedVectorType::get(m_builder->getInt32Ty(), 4); createLoadDwordAtAddrWithType(func, int32x4Ty, SPIRAS_Constant); } @@ -520,8 +524,7 @@ void SpirvProcessGpuRtLibrary::createConstantLoadDwordAtAddrx4(Function *func) { // // @param func : The function to process // @param loadTy : Load type -void SpirvProcessGpuRtLibrary::createLoadDwordAtAddrWithType(Function *func, Type *loadTy, - SPIRAddressSpace addressSpace) { +void ProcessGpuRtLibrary::createLoadDwordAtAddrWithType(Function *func, Type *loadTy, SPIRAddressSpace addressSpace) { auto argIt = func->arg_begin(); Value *gpuLowAddr = m_builder->CreateLoad(m_builder->getInt32Ty(), argIt++); @@ -548,7 +551,7 @@ void SpirvProcessGpuRtLibrary::createLoadDwordAtAddrWithType(Function *func, Typ // Fill in function to convert f32 to f16 with rounding toward negative // // @param func : The function to process -void 
SpirvProcessGpuRtLibrary::createConvertF32toF16NegInf(Function *func) { +void ProcessGpuRtLibrary::createConvertF32toF16NegInf(Function *func) { createConvertF32toF16WithRoundingMode(func, RoundingMode::TowardNegative); } @@ -556,7 +559,7 @@ void SpirvProcessGpuRtLibrary::createConvertF32toF16NegInf(Function *func) { // Fill in function to convert f32 to f16 with rounding toward positive // // @param func : The function to process -void SpirvProcessGpuRtLibrary::createConvertF32toF16PosInf(Function *func) { +void ProcessGpuRtLibrary::createConvertF32toF16PosInf(Function *func) { createConvertF32toF16WithRoundingMode(func, RoundingMode::TowardPositive); } @@ -565,7 +568,7 @@ void SpirvProcessGpuRtLibrary::createConvertF32toF16PosInf(Function *func) { // // @param func : The function to process // @param rm : Rounding mode -void SpirvProcessGpuRtLibrary::createConvertF32toF16WithRoundingMode(Function *func, RoundingMode rm) { +void ProcessGpuRtLibrary::createConvertF32toF16WithRoundingMode(Function *func, RoundingMode rm) { auto argIt = func->arg_begin(); Type *convertInputType = FixedVectorType::get(m_builder->getFloatTy(), 3); @@ -583,7 +586,7 @@ void SpirvProcessGpuRtLibrary::createConvertF32toF16WithRoundingMode(Function *f // Fill in function to return bvh node intersection result // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createIntersectBvh(Function *func) { +void ProcessGpuRtLibrary::createIntersectBvh(Function *func) { const auto *rtState = m_context->getPipelineContext()->getRayTracingState(); assert(rtState->bvhResDesc.dataSizeInDwords != 0); if (rtState->bvhResDesc.dataSizeInDwords < 4) @@ -648,7 +651,7 @@ void SpirvProcessGpuRtLibrary::createIntersectBvh(Function *func) { // // @param expansion : Box expansion // @param boxSortMode : Box sort mode -Value *SpirvProcessGpuRtLibrary::createGetBvhSrd(llvm::Value *expansion, llvm::Value *boxSortMode) { +Value *ProcessGpuRtLibrary::createGetBvhSrd(llvm::Value *expansion, 
llvm::Value *boxSortMode) { const auto *rtState = m_context->getPipelineContext()->getRayTracingState(); assert(rtState->bvhResDesc.dataSizeInDwords == 4); @@ -695,7 +698,7 @@ Value *SpirvProcessGpuRtLibrary::createGetBvhSrd(llvm::Value *expansion, llvm::V // Fill in function to sample gpu timer // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createSampleGpuTimer(llvm::Function *func) { +void ProcessGpuRtLibrary::createSampleGpuTimer(llvm::Function *func) { if (func->arg_size() == 2) { Value *timerHiPtr = func->getArg(0); Value *timerLoPtr = func->getArg(1); @@ -721,7 +724,7 @@ void SpirvProcessGpuRtLibrary::createSampleGpuTimer(llvm::Function *func) { // Fill in function to get flattened group thread ID // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createGetFlattenedGroupThreadId(llvm::Function *func) { +void ProcessGpuRtLibrary::createGetFlattenedGroupThreadId(llvm::Function *func) { m_builder->CreateRet(m_builder->create()); } @@ -729,7 +732,7 @@ void SpirvProcessGpuRtLibrary::createGetFlattenedGroupThreadId(llvm::Function *f // Fill in function to get hit attributes // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createGetHitAttributes(llvm::Function *func) { +void ProcessGpuRtLibrary::createGetHitAttributes(llvm::Function *func) { Value *tCurrentPtr = func->getArg(0); Value *kindPtr = func->getArg(1); Value *statusPtr = func->getArg(2); @@ -741,7 +744,7 @@ void SpirvProcessGpuRtLibrary::createGetHitAttributes(llvm::Function *func) { // Fill in function to set hit attributes // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createSetHitAttributes(llvm::Function *func) { +void ProcessGpuRtLibrary::createSetHitAttributes(llvm::Function *func) { Value *tCurrent = m_builder->CreateLoad(m_builder->getFloatTy(), func->getArg(0)); Value *kind = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(1)); Value *status = 
m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(2)); @@ -759,7 +762,7 @@ void SpirvProcessGpuRtLibrary::createSetHitAttributes(llvm::Function *func) { // Fill in function to set trace parameters // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createSetTraceParams(llvm::Function *func) { +void ProcessGpuRtLibrary::createSetTraceParams(llvm::Function *func) { Value *rayFlags = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(0)); Value *instanceInclusionMask = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(1)); Value *originX = m_builder->CreateLoad(m_builder->getFloatTy(), func->getArg(2)); @@ -778,7 +781,7 @@ void SpirvProcessGpuRtLibrary::createSetTraceParams(llvm::Function *func) { // Fill in function to call closest-hit shader // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createCallClosestHitShader(llvm::Function *func) { +void ProcessGpuRtLibrary::createCallClosestHitShader(llvm::Function *func) { Value *shaderId = m_builder->CreateLoad(FixedVectorType::get(m_builder->getInt32Ty(), 2), func->getArg(0)); Value *tableIndex = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(1)); m_builder->CreateRet(m_builder->create(shaderId, tableIndex)); @@ -788,7 +791,7 @@ void SpirvProcessGpuRtLibrary::createCallClosestHitShader(llvm::Function *func) // Fill in function to call miss shader // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createCallMissShader(llvm::Function *func) { +void ProcessGpuRtLibrary::createCallMissShader(llvm::Function *func) { Value *shaderId = m_builder->CreateLoad(FixedVectorType::get(m_builder->getInt32Ty(), 2), func->getArg(0)); Value *tableIndex = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(1)); m_builder->CreateRet(m_builder->create(shaderId, tableIndex)); @@ -798,7 +801,7 @@ void SpirvProcessGpuRtLibrary::createCallMissShader(llvm::Function *func) { // Fill in function to call triangle any-hit shader 
// // @param func : The function to create -void SpirvProcessGpuRtLibrary::createCallTriangleAnyHitShader(llvm::Function *func) { +void ProcessGpuRtLibrary::createCallTriangleAnyHitShader(llvm::Function *func) { Value *shaderId = m_builder->CreateLoad(FixedVectorType::get(m_builder->getInt32Ty(), 2), func->getArg(0)); Value *tableIndex = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(1)); @@ -813,7 +816,7 @@ void SpirvProcessGpuRtLibrary::createCallTriangleAnyHitShader(llvm::Function *fu // Fill in function to call intersection shader // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createCallIntersectionShader(llvm::Function *func) { +void ProcessGpuRtLibrary::createCallIntersectionShader(llvm::Function *func) { Value *shaderId = m_builder->CreateLoad(FixedVectorType::get(m_builder->getInt32Ty(), 2), func->getArg(0)); Value *anyHitShaderId = m_builder->CreateLoad(FixedVectorType::get(m_builder->getInt32Ty(), 2), func->getArg(1)); Value *tableIndex = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(2)); @@ -825,7 +828,7 @@ void SpirvProcessGpuRtLibrary::createCallIntersectionShader(llvm::Function *func // Fill in function to set triangle intersection attributes // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createSetTriangleIntersectionAttributes(llvm::Function *func) { +void ProcessGpuRtLibrary::createSetTriangleIntersectionAttributes(llvm::Function *func) { Value *barycentrics = m_builder->CreateLoad(FixedVectorType::get(m_builder->getFloatTy(), 2), func->getArg(0)); m_builder->create(barycentrics); m_builder->CreateRetVoid(); @@ -835,7 +838,7 @@ void SpirvProcessGpuRtLibrary::createSetTriangleIntersectionAttributes(llvm::Fun // Fill in function to set hit triangle node pointer // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createSetHitTriangleNodePointer(llvm::Function *func) { +void ProcessGpuRtLibrary::createSetHitTriangleNodePointer(llvm::Function *func) 
{ Value *bvhAddress = m_builder->CreateLoad(m_builder->getInt64Ty(), func->getArg(0)); Value *nodePointer = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(1)); m_builder->create(bvhAddress, nodePointer); @@ -846,7 +849,7 @@ void SpirvProcessGpuRtLibrary::createSetHitTriangleNodePointer(llvm::Function *f // Fill in function to get parent ID // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createGetParentId(llvm::Function *func) { +void ProcessGpuRtLibrary::createGetParentId(llvm::Function *func) { m_builder->CreateRet(m_builder->create()); } @@ -854,7 +857,7 @@ void SpirvProcessGpuRtLibrary::createGetParentId(llvm::Function *func) { // Fill in function to get set parent ID // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createSetParentId(llvm::Function *func) { +void ProcessGpuRtLibrary::createSetParentId(llvm::Function *func) { Value *rayId = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(0)); m_builder->create(rayId); m_builder->CreateRetVoid(); @@ -864,7 +867,7 @@ void SpirvProcessGpuRtLibrary::createSetParentId(llvm::Function *func) { // Fill in function to get dispatch ray index // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createDispatchRayIndex(llvm::Function *func) { +void ProcessGpuRtLibrary::createDispatchRayIndex(llvm::Function *func) { m_builder->CreateRet(m_builder->create()); } @@ -872,15 +875,23 @@ void SpirvProcessGpuRtLibrary::createDispatchRayIndex(llvm::Function *func) { // Fill in function to get ray static ID // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createGetStaticId(llvm::Function *func) { +void ProcessGpuRtLibrary::createGetStaticId(llvm::Function *func) { m_builder->CreateRet(m_builder->create()); } +// ===================================================================================================================== +// Fill in function to initialize ray static ID +// +// @param func : The function 
to create +void ProcessGpuRtLibrary::createInitStaticId(llvm::Function *func) { + m_builder->CreateRet(m_builder->create()); +} + // ===================================================================================================================== // Fill in function to get known set ray flags // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createGetKnownSetRayFlags(llvm::Function *func) { +void ProcessGpuRtLibrary::createGetKnownSetRayFlags(llvm::Function *func) { m_builder->CreateRet(m_builder->create()); } @@ -888,7 +899,7 @@ void SpirvProcessGpuRtLibrary::createGetKnownSetRayFlags(llvm::Function *func) { // Fill in function to get known unset ray flags // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createGetKnownUnsetRayFlags(llvm::Function *func) { +void ProcessGpuRtLibrary::createGetKnownUnsetRayFlags(llvm::Function *func) { m_builder->CreateRet(m_builder->create()); } @@ -896,14 +907,14 @@ void SpirvProcessGpuRtLibrary::createGetKnownUnsetRayFlags(llvm::Function *func) // Fill in function of AmdExtDispatchThreadIdFlat // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createDispatchThreadIdFlat(llvm::Function *func) { +void ProcessGpuRtLibrary::createDispatchThreadIdFlat(llvm::Function *func) { m_builder->CreateRet(m_builder->create()); } // ===================================================================================================================== // Fill in function to allocate continuation stack pointer // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createContStackAlloc(llvm::Function *func) { +void ProcessGpuRtLibrary::createContStackAlloc(llvm::Function *func) { assert(func->arg_size() == 1); Value *byteSize = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(0)); auto stackPtr = m_builder->create(byteSize); @@ -914,7 +925,7 @@ void SpirvProcessGpuRtLibrary::createContStackAlloc(llvm::Function *func) { // Fill in function to 
free continuation stack pointer // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createContStackFree(llvm::Function *func) { +void ProcessGpuRtLibrary::createContStackFree(llvm::Function *func) { Value *byteSize = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(0)); m_builder->create(byteSize); m_builder->CreateRetVoid(); @@ -924,7 +935,7 @@ void SpirvProcessGpuRtLibrary::createContStackFree(llvm::Function *func) { // Fill in function to get continuation stack pointer // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createContStackGetPtr(llvm::Function *func) { +void ProcessGpuRtLibrary::createContStackGetPtr(llvm::Function *func) { auto stackPtr = m_builder->create(); m_builder->CreateRet(m_builder->CreatePtrToInt(stackPtr, m_builder->getInt32Ty())); } @@ -933,7 +944,7 @@ void SpirvProcessGpuRtLibrary::createContStackGetPtr(llvm::Function *func) { // Fill in function to set continuation stack pointer // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createContStackSetPtr(llvm::Function *func) { +void ProcessGpuRtLibrary::createContStackSetPtr(llvm::Function *func) { auto csp = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(0)); m_builder->create(m_builder->CreateIntToPtr(csp, m_builder->getPtrTy(cps::stackAddrSpace))); m_builder->CreateRetVoid(); @@ -943,7 +954,7 @@ void SpirvProcessGpuRtLibrary::createContStackSetPtr(llvm::Function *func) { // Fill in function to load from given continuation stack address // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createContStackLoad(llvm::Function *func) { +void ProcessGpuRtLibrary::createContStackLoad(llvm::Function *func) { auto loadTy = func->getReturnType(); auto addr = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(0)); auto ptr = m_builder->CreateIntToPtr(addr, m_builder->getPtrTy(cps::stackAddrSpace)); @@ -954,7 +965,7 @@ void 
SpirvProcessGpuRtLibrary::createContStackLoad(llvm::Function *func) { // Fill in function to store to given continuation stack address // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createContStackStore(llvm::Function *func) { +void ProcessGpuRtLibrary::createContStackStore(llvm::Function *func) { unsigned dataArgIndex = func->arg_size() - 1; Type *dataType = getFuncArgPtrElementType(func, dataArgIndex); @@ -969,7 +980,7 @@ void SpirvProcessGpuRtLibrary::createContStackStore(llvm::Function *func) { // Fill in function to enqueue shader // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createEnqueue(Function *func) { +void ProcessGpuRtLibrary::createEnqueue(Function *func) { auto funcName = func->getName(); Value *addr = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(0)); @@ -978,7 +989,7 @@ void SpirvProcessGpuRtLibrary::createEnqueue(Function *func) { bool hasWaitMaskArg = funcName.contains("Wait"); // Skip waitMask unsigned retAddrArgIdx = hasWaitMaskArg ? 2 : 1; - tailArgs.push_back(m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(retAddrArgIdx))); + Value *retAddr = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(retAddrArgIdx)); // Get shader-index from system-data. unsigned systemDataArgIdx = retAddrArgIdx + 1; tailArgs.push_back(m_builder->CreateNamedCall("_cont_GetLocalRootIndex", m_builder->getInt32Ty(), @@ -991,14 +1002,14 @@ void SpirvProcessGpuRtLibrary::createEnqueue(Function *func) { } // TODO: pass the levelMask correctly. 
- m_builder->create(addr, -1, PoisonValue::get(StructType::get(*m_context, {})), tailArgs); + m_builder->create(addr, -1, PoisonValue::get(StructType::get(*m_context, {})), retAddr, tailArgs); m_builder->CreateUnreachable(); } // Fill in function to check whether continuation stack is global // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createContinuationStackIsGlobal(llvm::Function *func) { +void ProcessGpuRtLibrary::createContinuationStackIsGlobal(llvm::Function *func) { m_builder->CreateRet(m_builder->create()); } @@ -1006,17 +1017,26 @@ void SpirvProcessGpuRtLibrary::createContinuationStackIsGlobal(llvm::Function *f // Fill in function to get RTIP // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createGetRtip(llvm::Function *func) { +void ProcessGpuRtLibrary::createGetRtip(llvm::Function *func) { auto rtip = m_context->getPipelineContext()->getRayTracingState()->rtIpVersion; // The version is encoded as in decimal digits, so 11 is rtip 1.1, 20 is rtip 2.0 m_builder->CreateRet(m_builder->getInt32(rtip.major * 10 + rtip.minor)); } +// ===================================================================================================================== +// Fill in function to tell GPURT it is compiled from LLPC +// +// @param func : The function to create +void ProcessGpuRtLibrary::createIsLlpc(llvm::Function *func) { + auto *trueConst = ConstantInt::getTrue(func->getContext()); + m_builder->CreateRet(trueConst); +} + // ===================================================================================================================== // Fill in function to write shader marker // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createShaderMarker(llvm::Function *func) { +void ProcessGpuRtLibrary::createShaderMarker(llvm::Function *func) { Value *dataPtr = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(0)); m_builder->CreateIntrinsic(Intrinsic::amdgcn_s_ttracedata, {}, 
dataPtr); m_builder->CreateRetVoid(); @@ -1026,7 +1046,7 @@ void SpirvProcessGpuRtLibrary::createShaderMarker(llvm::Function *func) { // Fill in function to write wave scan // // @param func : The function to create -void SpirvProcessGpuRtLibrary::createWaveScan(llvm::Function *func) { +void ProcessGpuRtLibrary::createWaveScan(llvm::Function *func) { auto argIt = func->arg_begin(); auto retType = cast(func->getReturnType()); auto int32Ty = m_builder->getInt32Ty(); diff --git a/llpc/lower/llpcSpirvProcessGpuRtLibrary.h b/llpc/lower/ProcessGpuRtLibrary.h similarity index 91% rename from llpc/lower/llpcSpirvProcessGpuRtLibrary.h rename to llpc/lower/ProcessGpuRtLibrary.h index 7b0725c0b5..30fe5a5ca5 100644 --- a/llpc/lower/llpcSpirvProcessGpuRtLibrary.h +++ b/llpc/lower/ProcessGpuRtLibrary.h @@ -1,7 +1,7 @@ /* *********************************************************************************************************************** * - * Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file llpcSpirvProcessGpuRtLibrary.h - * @brief LLPC header file: contains declaration of Llpc::SpirvProcessGpuRtLibrary + * @file ProcessGpuRtLibrary.h + * @brief LLPC header file: contains declaration of Llpc::ProcessGpuRtLibrary *********************************************************************************************************************** */ #pragma once @@ -35,13 +35,13 @@ #include "llvm/IR/PassManager.h" namespace Llpc { -class SpirvProcessGpuRtLibrary : public SpirvLower, public llvm::PassInfoMixin { +class ProcessGpuRtLibrary : public SpirvLower, public llvm::PassInfoMixin { public: - SpirvProcessGpuRtLibrary(); + ProcessGpuRtLibrary(); llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); private: - typedef void (SpirvProcessGpuRtLibrary::*LibraryFuncPtr)(llvm::Function *); + typedef void (ProcessGpuRtLibrary::*LibraryFuncPtr)(llvm::Function *); struct LibraryFunctionTable { llvm::DenseMap m_libFuncPtrs; LibraryFunctionTable(); @@ -88,6 +88,7 @@ class SpirvProcessGpuRtLibrary : public SpirvLower, public llvm::PassInfoMixin (...) 
@lgc.create.reflect.v4f32(<4 x float> +; SHADERTEST-LABEL: {{^// LLPC}} pipeline before-patching results +; SHADERTEST: = fmul nnan nsz arcp contract afn float +; SHADERTEST-NEXT: = fmul nnan nsz arcp contract afn float +; SHADERTEST-NEXT: = fmul nnan nsz arcp contract afn float +; SHADERTEST-NEXT: = fsub nnan nsz arcp contract afn float ; SHADERTEST: AMDLLPC SUCCESS */ // END_SHADERTEST diff --git a/llpc/test/shaderdb/general/TestNumComponentsWithReversedAccessOrder.mesh b/llpc/test/shaderdb/general/TestNumComponentsWithReversedAccessOrder.mesh index bbcacc3da0..a9232b340b 100644 --- a/llpc/test/shaderdb/general/TestNumComponentsWithReversedAccessOrder.mesh +++ b/llpc/test/shaderdb/general/TestNumComponentsWithReversedAccessOrder.mesh @@ -33,7 +33,7 @@ void main() { data1[gl_LocalInvocationIndex].z = 0.3; data1[gl_LocalInvocationIndex].y = 0.2; data1[gl_LocalInvocationIndex].x = 0.1; - + data2[gl_LocalInvocationIndex].z = -0.3; data2[gl_LocalInvocationIndex].y = -0.2; data2[gl_LocalInvocationIndex].x = -0.1; diff --git a/llpc/test/shaderdb/general/UndefVertexOutput.spvasm b/llpc/test/shaderdb/general/UndefVertexOutput.spvasm index 83197a426f..7f5c983ad2 100644 --- a/llpc/test/shaderdb/general/UndefVertexOutput.spvasm +++ b/llpc/test/shaderdb/general/UndefVertexOutput.spvasm @@ -86,15 +86,15 @@ ; CHECK-NEXT: s_getpc_b64 s[2:3] ; CHECK-NEXT: v_add_nc_u32_e32 v0, s10, v5 ; CHECK-NEXT: s_mov_b32 s1, s3 -; CHECK-NEXT: v_mov_b32_e32 v4, 1.0 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x10 -; CHECK-NEXT: v_mov_b32_e32 v5, 0 +; CHECK-NEXT: v_mov_b32_e32 v5, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: exp pos0 v0, v1, v2, v3 done -; CHECK-NEXT: exp param7 v5, v5, v4, v4 -; CHECK-NEXT: exp param9 off, v5, off, off -; CHECK-NEXT: exp param8 v4, v5, v5, v4 +; CHECK-NEXT: exp param2 v5, v4, v4, v5 +; 
CHECK-NEXT: exp param1 v4, v4, v5, v5 +; CHECK-NEXT: exp param3 off, v4, off, off ; CHECK-NEXT: .LBB0_6: ; CHECK-NEXT: s_endpgm diff --git a/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe b/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe index d75fa4f7bd..c34b187492 100644 --- a/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe +++ b/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe @@ -278,8 +278,6 @@ colorBuffer[0].blendSrcAlphaToColor = 0 ; CHECK-NEXT: .vgt_gs_out_prim_type: ; CHECK-NEXT: .outprim_type: PointList ; CHECK-NEXT: .vgt_gs_per_vs: 0x2 -; CHECK-NEXT: .vgt_gs_vert_itemsize: 0 -; CHECK-NEXT: .vgt_gsvs_ring_itemsize: 0 ; CHECK-NEXT: .vgt_reuse_off: false ; CHECK-NEXT: .vgt_shader_stages_en: ; CHECK-NEXT: .es_stage_en: 0x2 diff --git a/llpc/test/shaderdb/ray_tracing/TestKnownRayFlags.pipe b/llpc/test/shaderdb/ray_tracing/TestKnownRayFlags.pipe new file mode 100644 index 0000000000..4e266cabc9 --- /dev/null +++ b/llpc/test/shaderdb/ray_tracing/TestKnownRayFlags.pipe @@ -0,0 +1,108 @@ +// BEGIN_SHADERTEST +/* +; RUN: amdllpc --print-after=lgc-lower-gpurt %gfxip 2>&1 %s | FileCheck -check-prefix=SHADERTEST %s +; Check whether ray flags are replaced by known values +; SHADERTEST: %{{.*}} = xor i32 -171, -1 +; SHADERTEST: %{{.*}} = or i32 %{{.*}}, 170 +*/ +// END_SHADERTEST + +[Version] +version = 74 + +[rgenGlsl] + +#version 460 +#extension GL_EXT_ray_tracing : require + +struct RayPayload { + vec3 color; +}; + +layout(binding = 0, set = 0) uniform accelerationStructureEXT g_bvh; +layout(location = 0) rayPayloadEXT RayPayload g_ray; + +void main() { + vec3 origin; + origin.x = gl_LaunchIDEXT.x; + origin.y = gl_LaunchIDEXT.y; + origin.z = 0; + + traceRayEXT(g_bvh, /* ray flags */ 0xAA, /* cull mask */ 0xff, + /* sbt offset */ 0, /* sbt stride */ 1, /* miss index */ 0, + origin.xyz, /* tmin */ 0.0, /* direction */ vec3(1, 0, 0), + /* tmax */ 48.0, /* payload location */ 0); +} + +[rgenInfo] +entryPoint = main + +[ResourceMapping] 
+userDataNode[0].visibility = 0xffffffff +userDataNode[0].type = DescriptorTableVaPtr +userDataNode[0].offsetInDwords = 0 +userDataNode[0].sizeInDwords = 1 +userDataNode[0].next[0].type = DescriptorConstBuffer +userDataNode[0].next[0].offsetInDwords = 0 +userDataNode[0].next[0].sizeInDwords = 4 +userDataNode[0].next[0].set = 0x00000000 +userDataNode[0].next[0].binding = 0 +userDataNode[0].next[1].type = DescriptorImage +userDataNode[0].next[1].offsetInDwords = 4 +userDataNode[0].next[1].sizeInDwords = 8 +userDataNode[0].next[1].set = 0x00000000 +userDataNode[0].next[1].binding = 1 +userDataNode[1].visibility = 0xffffffff +userDataNode[1].type = DescriptorTableVaPtr +userDataNode[1].offsetInDwords = 1 +userDataNode[1].sizeInDwords = 1 +userDataNode[1].next[0].type = DescriptorConstBufferCompact +userDataNode[1].next[0].offsetInDwords = 0 +userDataNode[1].next[0].sizeInDwords = 2 +userDataNode[1].next[0].set = 0x0000005D +userDataNode[1].next[0].binding = 17 +userDataNode[1].next[1].type = DescriptorConstBuffer +userDataNode[1].next[1].offsetInDwords = 2 +userDataNode[1].next[1].sizeInDwords = 4 +userDataNode[1].next[1].set = 0x0000005D +userDataNode[1].next[1].binding = 0 +userDataNode[1].next[2].type = DescriptorBuffer +userDataNode[1].next[2].offsetInDwords = 6 +userDataNode[1].next[2].sizeInDwords = 4 +userDataNode[1].next[2].set = 0x0000005D +userDataNode[1].next[2].binding = 1 + +[RayTracingPipelineState] +groups[0].type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR +groups[0].generalShader = 0 +groups[0].closestHitShader = -1 +groups[0].anyHitShader = -1 +groups[0].intersectionShader = -1 +groups[1].type = VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR +groups[1].closestHitShader = 1 +maxRecursionDepth = 1 +indirectStageMask = 0xffffffff +libraryMode = 1 +mode = 1 +rtState.bvhResDescSize = 4 +rtState.bvhResDesc[0] = 0 +rtState.bvhResDesc[1] = 2197815296 +rtState.bvhResDesc[2] = 4294967295 +rtState.bvhResDesc[3] = 2164261887 +rtState.nodeStrideShift 
= 7 +rtState.threadGroupSizeX = 8 +rtState.threadGroupSizeY = 4 +rtState.threadGroupSizeZ = 1 +rtState.rayQueryCsSwizzle = 1 +rtState.ldsStackSize = 16 +rtState.dispatchRaysThreadGroupSize = 32 +rtState.ldsSizePerThreadGroup = 65536 +rtState.outerTileSize = 4 +rtState.dispatchDimSwizzleMode = 0 +rtState.enableDispatchRaysInnerSwizzle = 1 +rtState.enableDispatchRaysOuterSwizzle = 1 +rtState.enableOptimalLdsStackSizeForIndirect = 1 +rtState.enableOptimalLdsStackSizeForUnified = 1 +payloadSizeMaxInLib = 12 +attributeSizeMaxInLib = 8 +hasPipelineLibrary = 1 diff --git a/llpc/test/shaderdb/ray_tracing/TestKnownRayFlags.rgen b/llpc/test/shaderdb/ray_tracing/TestKnownRayFlags.rgen deleted file mode 100644 index b817bba209..0000000000 --- a/llpc/test/shaderdb/ray_tracing/TestKnownRayFlags.rgen +++ /dev/null @@ -1,30 +0,0 @@ -// BEGIN_SHADERTEST -/* -; RUN: amdllpc --print-after=lgc-lower-gpurt %gfxip 2>&1 %s | FileCheck -check-prefix=SHADERTEST %s -; Check whether ray flags are replaced by known values -; SHADERTEST: %{{.*}} = xor i32 -171, -1 -; SHADERTEST: %{{.*}} = or i32 %{{.*}}, 170 -*/ -// END_SHADERTEST - -#version 460 -#extension GL_EXT_ray_tracing : require - -struct RayPayload { - vec3 color; -}; - -layout(binding = 0, set = 0) uniform accelerationStructureEXT g_bvh; -layout(location = 0) rayPayloadEXT RayPayload g_ray; - -void main() { - vec3 origin; - origin.x = gl_LaunchIDEXT.x; - origin.y = gl_LaunchIDEXT.y; - origin.z = 0; - - traceRayEXT(g_bvh, /* ray flags */ 0xAA, /* cull mask */ 0xff, - /* sbt offset */ 0, /* sbt stride */ 1, /* miss index */ 0, - origin.xyz, /* tmin */ 0.0, /* direction */ vec3(1, 0, 0), - /* tmax */ 48.0, /* payload location */ 0); -} diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineGs_TestOutputLocations.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineGs_TestOutputLocations.pipe deleted file mode 100644 index 9edc0ad5ad..0000000000 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineGs_TestOutputLocations.pipe +++ 
/dev/null @@ -1,69 +0,0 @@ -// This test case checks that a pipeline with geometry shader will place the Gs outputs in a position that matches the -// Fs inputs. - -; BEGIN_SHADERTEST -; RUN: amdllpc -enable-relocatable-shader-elf -o %t.elf %gfxip %s -; RUN: llvm-objdump --triple=amdgcn --mcpu=gfx1010 -d %t.elf | FileCheck -check-prefix=SHADERTEST %s -; SHADERTEST-LABEL: <_amdgpu_vs_main>: -; SHADERTEST: exp param[[loc:[0-9]*]] v{{[0-9]*}}, v{{[0-9]*}}, v{{[0-9]*}}, v{{[0-9]*}} -; SHADERTEST-LABEL: <_amdgpu_ps_main>: -; SHADERTEST: v_interp_p1_f32_e32 v{{[0-9]*}}, v{{[0-9]*}}, attr[[loc]].x -; SHADERTEST: v_interp_p1_f32_e32 v{{[0-9]*}}, v{{[0-9]*}}, attr[[loc]].y -; SHADERTEST: v_interp_p2_f32_e32 v{{[0-9]*}}, v{{[0-9]*}}, attr[[loc]].x -; SHADERTEST: v_interp_p2_f32_e32 v{{[0-9]*}}, v{{[0-9]*}}, attr[[loc]].y - -; END_SHADERTEST - -[Version] -version = 49 - -[VsGlsl] -#version 450 - -void main() -{ - gl_PointSize = 1.0; -} - -[VsInfo] -entryPoint = main - -[GsGlsl] -#version 450 -layout(points) in; -layout(max_vertices = 2, line_strip) out; - -layout(location = 1) out vec4 o1; - -void main() -{ - gl_Position = vec4(0.0, 1.0, 2.0, 1.0); - o1 = vec4(3.0, 4.0, 5.0, 1.0); - EmitVertex(); - gl_Position.x = 1.0; - EmitVertex(); -} - - -[GsInfo] -entryPoint = main - -[FsGlsl] -#version 450 - -layout(location = 1) in vec4 v1; -layout(location = 0) out vec4 o0; - -void main() -{ - o0 = vec4(v1.xy, 0.0, 1.0); -} - -[FsInfo] -entryPoint = main - -[GraphicsPipelineState] -colorBuffer[0].format = VK_FORMAT_R8G8B8A8_UNORM -colorBuffer[0].channelWriteMask = 15 -colorBuffer[0].blendEnable = 1 -colorBuffer[0].blendSrcAlphaToColor = 0 diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_FillPsInput.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_FillPsInput.pipe index dc6f071193..3329fa71cb 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_FillPsInput.pipe +++ b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_FillPsInput.pipe @@ -11,28 
+11,7 @@ ; SHADERTEST: .fp16_interp_mode: 0 ; SHADERTEST: .offset: 0x0000000000000000 ; SHADERTEST: .prim_attr: 0 -; SHADERTEST: .pt_sprite_tex: 0 }{ -; SHADERTEST: .attr0_valid: 0x0000000000000000 -; SHADERTEST: .attr1_valid: 0x0000000000000000 -; SHADERTEST: .flat_shade: 0 -; SHADERTEST: .fp16_interp_mode: 0 -; SHADERTEST: .offset: 0x0000000000000001 -; SHADERTEST: .prim_attr: 0 -; SHADERTEST: .pt_sprite_tex: 0 }{ -; SHADERTEST: .attr0_valid: 0x0000000000000000 -; SHADERTEST: .attr1_valid: 0x0000000000000000 -; SHADERTEST: .flat_shade: 0 -; SHADERTEST: .fp16_interp_mode: 0 -; SHADERTEST: .offset: 0x0000000000000002 -; SHADERTEST: .prim_attr: 0 -; SHADERTEST: .pt_sprite_tex: 0 }{ -; SHADERTEST: .attr0_valid: 0x0000000000000000 -; SHADERTEST: .attr1_valid: 0x0000000000000000 -; SHADERTEST: .flat_shade: 0 -; SHADERTEST: .fp16_interp_mode: 0 -; SHADERTEST: .offset: 0x0000000000000003 -; SHADERTEST: .prim_attr: 0 -; SHADERTEST: .pt_sprite_tex: 0 }] +; SHADERTEST: .pt_sprite_tex: 0 }] ; SHADERTEST: AMDLLPC SUCCESS ; END_SHADERTEST diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_TestRelocatableInOutMapping.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_TestRelocatableInOutMapping.pipe index cab13841e7..395a479702 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_TestRelocatableInOutMapping.pipe +++ b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_TestRelocatableInOutMapping.pipe @@ -4,21 +4,20 @@ ; BEGIN_SHADERTEST ; RUN: amdllpc -enable-relocatable-shader-elf -auto-layout-desc -v %gfxip %s \ ; RUN: | FileCheck -check-prefix=SHADERTEST %s +; Note: FS input location should match SPI_PS_INPUT_CNTL_*, instead of VS output location ; SHADERTEST-LABEL: {{^//}} LLPC location input/output mapping results (VS){{$}} ; ; SHADERTEST: (VS) Output: [location, component] = [0, 0] => Mapped = [[[loc0:[0-9]+]], 0] ; SHADERTEST-NEXT: (VS) Output: [location, component] = [1, 0] => Mapped = [[[loc1:[0-9]+]], 0] -; SHADERTEST-NEXT: (VS) 
Output: [location, component] = [2, 0] => Mapped = [[[loc2:[0-9]+]], 0] -; SHADERTEST-NEXT: (VS) Output: [location, component] = [3, 0] => Mapped = [[[loc3:[0-9]+]], 0] -; SHADERTEST-NEXT: (VS) Output: [location, component] = [4, 0] => Mapped = [[[loc4:[0-9]+]], 0] +; SHADERTEST-NEXT: (VS) Output: [location, component] = [3, 0] => Mapped = [[[loc2:[0-9]+]], 0] +; SHADERTEST-NEXT: (VS) Output: [location, component] = [4, 0] => Mapped = [[[loc3:[0-9]+]], 0] ; ; SHADERTEST-LABEL: {{^//}} LLPC location input/output mapping results (FS){{$}} ; ; SHADERTEST: (FS) Input: [location, component] = [0, 0] => Mapped = [[[loc0]], 0] ; SHADERTEST-NEXT: (FS) Input: [location, component] = [1, 0] => Mapped = [[[loc1]], 0] -; SHADERTEST-NEXT: (FS) Input: [location, component] = [2, 0] => Mapped = [[[loc2]], 0] -; SHADERTEST-NEXT: (FS) Input: [location, component] = [3, 0] => Mapped = [[[loc3]], 0] -; SHADERTEST-NEXT: (FS) Input: [location, component] = [4, 0] => Mapped = [[[loc4]], 0] +; SHADERTEST-NEXT: (FS) Input: [location, component] = [3, 0] => Mapped = [[[loc2]], 0] +; SHADERTEST-NEXT: (FS) Input: [location, component] = [4, 0] => Mapped = [[[loc3]], 0] ; ; SHADERTEST: (FS) Output: [location, component] = [0, 0] => Mapped = [0, 0] ; diff --git a/llpc/tool/amdllpc.cpp b/llpc/tool/amdllpc.cpp index 9515f034b0..ce4f0e3a4f 100644 --- a/llpc/tool/amdllpc.cpp +++ b/llpc/tool/amdllpc.cpp @@ -719,6 +719,7 @@ static Error processInputs(ICompiler *compiler, InputSpecGroup &inputSpecs, bool compileInfo.rayTracePipelineInfo.indirectStageMask = 0xFFFFFFFF; compileInfo.rayTracePipelineInfo.pipelineLibStageMask = 0xFFFFFFFF; compileInfo.rayTracePipelineInfo.hasPipelineLibrary = true; + compileInfo.rayTracePipelineInfo.libraryMode = LibraryMode::Library; standaloneRtShaders.resize(compileInfo.shaderModuleDatas.size()); memset(&standaloneRtShaders[0], 0, sizeof(PipelineShaderInfo) * standaloneRtShaders.size()); diff --git a/llpc/translator/lib/SPIRV/SPIRVReader.cpp 
b/llpc/translator/lib/SPIRV/SPIRVReader.cpp index 2b5f8eee30..e1b7cd8455 100644 --- a/llpc/translator/lib/SPIRV/SPIRVReader.cpp +++ b/llpc/translator/lib/SPIRV/SPIRVReader.cpp @@ -45,12 +45,12 @@ #include "SPIRVType.h" #include "SPIRVUtil.h" #include "SPIRVValue.h" -#include "compilerutils/TypesMetadata.h" #include "llpcCompiler.h" #include "llpcContext.h" #include "llpcDialect.h" #include "llpcPipelineContext.h" #include "llpcRayTracingContext.h" +#include "compilerutils/TypesMetadata.h" #include "llvmraytracing/ContinuationsUtil.h" #include "lgc/LgcDialect.h" #include "lgc/LgcRtDialect.h" @@ -6244,8 +6244,17 @@ SmallVector SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *bv, Fu return mapValue(bv, getBuilder()->create( matrixType, co, val, basicSrcElemTy, basicDstElemTy, srcLayout, dstLayout, "fConvert")); } - if (val->getType()->getScalarType()->getPrimitiveSizeInBits() <= destTy->getScalarType()->getPrimitiveSizeInBits()) + unsigned valTypeBitWide = val->getType()->getScalarType()->getPrimitiveSizeInBits(); + unsigned destTypeBitWide = destTy->getScalarType()->getPrimitiveSizeInBits(); + if (valTypeBitWide < destTypeBitWide) return mapValue(bv, getBuilder()->CreateFPExt(val, destTy)); + else if (valTypeBitWide == destTypeBitWide) { + assert(val->getType()->getScalarType()->isBFloatTy() || val->getType()->getScalarType()->isHalfTy()); + val = getBuilder()->CreateFPExt( + val, destTy->isVectorTy() + ? 
FixedVectorType::get(getBuilder()->getFloatTy(), cast(destTy)->getNumElements()) + : getBuilder()->getFloatTy()); + } RoundingMode rm = RoundingMode::Dynamic; SPIRVFPRoundingModeKind rounding; @@ -8779,6 +8788,13 @@ bool SPIRVToLLVM::transMetadata() { meshMode.workgroupSizeZ = overrideShaderGroupSizeZ; } + if (bf->getExecutionMode(ExecutionModeDerivativeGroupQuadsNV)) + meshMode.derivativeMode = DerivativeMode::Quads; + else if (bf->getExecutionMode(ExecutionModeDerivativeGroupLinearNV)) + meshMode.derivativeMode = DerivativeMode::Linear; + else + meshMode.derivativeMode = DerivativeMode::None; + Pipeline::setMeshShaderMode(*m_m, meshMode); } else if (execModel == ExecutionModelFragment) { FragmentShaderMode fragmentMode = {}; @@ -8821,6 +8837,11 @@ bool SPIRVToLLVM::transMetadata() { bf->getExecutionMode(ExecutionModeSampleInterlockOrderedEXT) || bf->getExecutionMode(ExecutionModeSampleInterlockUnorderedEXT)) fragmentMode.enablePops = true; + auto llpcContext = static_cast(m_context); + auto pipelineBuildInfo = + static_cast(llpcContext->getPipelineBuildInfo()); + if (pipelineBuildInfo->advancedBlendInfo.enableRov) + fragmentMode.enablePops = true; fragmentMode.waveOpsRequireHelperLanes = m_maximallyReconverges && m_hasDemoteToHelper; @@ -8869,14 +8890,13 @@ bool SPIRVToLLVM::transMetadata() { ComputeShaderMode computeMode = {}; if (bf->getExecutionMode(ExecutionModeDerivativeGroupQuadsNV)) - computeMode.derivatives = DerivativeMode::Quads; + computeMode.derivativeMode = DerivativeMode::Quads; else if (bf->getExecutionMode(ExecutionModeDerivativeGroupLinearNV)) - computeMode.derivatives = DerivativeMode::Linear; + computeMode.derivativeMode = DerivativeMode::Linear; else - computeMode.derivatives = DerivativeMode::None; - + computeMode.derivativeMode = DerivativeMode::None; if (bf->getExecutionMode(ExecutionModeQuadDerivativesKHR)) - computeMode.derivatives = DerivativeMode::Quads; + computeMode.derivativeMode = DerivativeMode::Quads; unsigned 
overrideShaderGroupSizeX = m_shaderOptions->overrideShaderThreadGroupSizeX; unsigned overrideShaderGroupSizeY = m_shaderOptions->overrideShaderThreadGroupSizeY; @@ -10904,10 +10924,6 @@ void SPIRVToLLVM::createXfbMetadata(bool hasXfbOuts) { auto llpcContext = static_cast(m_context); auto pipelineBuildInfo = static_cast(llpcContext->getPipelineBuildInfo()); bool needXfbMetadata = hasXfbOuts && !pipelineBuildInfo->getGlState().apiXfbOutData.forceDisableStreamOut; -#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 70 - needXfbMetadata |= pipelineBuildInfo->apiXfbOutData.forceEnablePrimStats; -#endif - if (!needXfbMetadata) return; diff --git a/llpc/translator/lib/SPIRV/SPIRVReader.h b/llpc/translator/lib/SPIRV/SPIRVReader.h index df5a3c0527..0ef2965404 100644 --- a/llpc/translator/lib/SPIRV/SPIRVReader.h +++ b/llpc/translator/lib/SPIRV/SPIRVReader.h @@ -43,8 +43,8 @@ #include "SPIRVInternal.h" #include "SPIRVModule.h" #include "SPIRVToLLVMDbgTran.h" -#include "compilerutils/LoweringPointerTupleMap.h" #include "vkgcDefs.h" +#include "compilerutils/LoweringPointerTupleMap.h" #include "lgc/Builder.h" namespace llvm { @@ -391,7 +391,6 @@ class SPIRVToLLVM { // Returns a cached type store size. If there is no entry for the given type, // its store size is calculated and added to the cache. uint64_t getTypeStoreSize(Type *const t); - // If a value is mapped twice, the existing mapped value is a placeholder, // which must be a load instruction of a global variable whose name starts // with kPlaceholderPrefix. 
diff --git a/llpc/translator/lib/SPIRV/hex_float.h b/llpc/translator/lib/SPIRV/hex_float.h index a5294b298a..a6ffe50072 100644 --- a/llpc/translator/lib/SPIRV/hex_float.h +++ b/llpc/translator/lib/SPIRV/hex_float.h @@ -57,6 +57,25 @@ class Float16 { uint16_t val; }; +class BFloat16 { +public: + BFloat16(uint16_t v) : val(v) {} + BFloat16() {} + static bool isNan(const BFloat16 &val) { return ((val.val & 0x7F80) == 0x7F80) && ((val.val & 0x7F) != 0); } + // Returns true if the given value is any kind of infinity. + static bool isInfinity(const BFloat16 &val) { return ((val.val & 0x7F80) == 0x7F80) && ((val.val & 0x7F) == 0); } + BFloat16(const BFloat16 &other) { val = other.val; } + uint16_t get_value() const { return val; } + + // Returns the maximum normal value. + static BFloat16 max() { return BFloat16(0x7f7f); } + // Returns the lowest normal value. + static BFloat16 lowest() { return BFloat16(0xff7f); } + +private: + uint16_t val; +}; + // To specialize this type, you must override uint_type to define // an unsigned integer that can fit your floating point type. // You must also add a isNan function that returns true if @@ -96,6 +115,17 @@ template <> struct FloatProxyTraits { static Float16 lowest() { return Float16::lowest(); } }; +template <> struct FloatProxyTraits { + typedef uint16_t uint_type; + static bool isNan(BFloat16 f) { return BFloat16::isNan(f); } + // Returns true if the given value is any kind of infinity. + static bool isInfinity(BFloat16 f) { return BFloat16::isInfinity(f); } + // Returns the maximum normal value. + static BFloat16 max() { return BFloat16::max(); } + // Returns the lowest normal value. + static BFloat16 lowest() { return BFloat16::lowest(); } +}; + // Since copying a floating point number (especially if it is NaN) // does not guarantee that bits are preserved, this class lets us // store the type and use it as a float when necessary. 
@@ -213,6 +243,19 @@ template <> struct HexFloatTraits> { static const uint_type exponent_bias = 15; }; +// Traits for IEEE brain float. +// 1 sign bit, 8 exponent bits, 7 fractional bits. +template <> struct HexFloatTraits> { + typedef uint16_t uint_type; + typedef int16_t int_type; + typedef uint16_t underlying_type; + typedef uint16_t native_type; + static const uint_type num_used_bits = 16; + static const uint_type num_exponent_bits = 8; + static const uint_type num_fraction_bits = 7; + static const uint_type exponent_bias = 127; +}; + enum round_direction { kRoundToZero, kRoundToNearestEven, kRoundToPositiveInfinity, kRoundToNegativeInfinity }; // Template class that houses a floating pointer number. diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp index 279d1b92af..df8d706f6e 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.cpp @@ -195,7 +195,7 @@ class SPIRVModuleImpl : public SPIRVModule { SPIRVTypeRuntimeArray *addRuntimeArray(SPIRVType *) override; SPIRVTypeStruct *addStructType(const std::vector &vecTypes) override; SPIRVTypeBool *addBoolType() override; - SPIRVTypeFloat *addFloatType(unsigned BitWidth) override; + SPIRVTypeFloat *addFloatType(unsigned BitWidth, unsigned Encoding) override; SPIRVTypeFunction *addFunctionType(SPIRVType *, const std::vector &) override; SPIRVTypeInt *addIntegerType(unsigned BitWidth) override; SPIRVTypePointer *addPointerType(SPIRVStorageClassKind, SPIRVType *) override; @@ -682,8 +682,8 @@ SPIRVTypeInt *SPIRVModuleImpl::addIntegerType(unsigned BitWidth) { return addType(Ty); } -SPIRVTypeFloat *SPIRVModuleImpl::addFloatType(unsigned BitWidth) { - SPIRVTypeFloat *T = addType(new SPIRVTypeFloat(this, getId(), BitWidth)); +SPIRVTypeFloat *SPIRVModuleImpl::addFloatType(unsigned BitWidth, unsigned Encoding) { + SPIRVTypeFloat *T = addType(new SPIRVTypeFloat(this, getId(), BitWidth, Encoding)); 
return T; } diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.h index afa46ef3f9..f1757ee767 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.h +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVModule.h @@ -188,7 +188,7 @@ class SPIRVModule { virtual SPIRVTypeRuntimeArray *addRuntimeArray(SPIRVType *) = 0; virtual SPIRVTypeStruct *addStructType(const std::vector &vecTypes) = 0; virtual SPIRVTypeBool *addBoolType() = 0; - virtual SPIRVTypeFloat *addFloatType(unsigned) = 0; + virtual SPIRVTypeFloat *addFloatType(unsigned, unsigned) = 0; virtual SPIRVTypeFunction *addFunctionType(SPIRVType *, const std::vector &) = 0; virtual SPIRVTypeImage *addImageType(SPIRVType *, const SPIRVTypeImageDescriptor &) = 0; virtual SPIRVTypeSampler *addSamplerType() = 0; diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVType.cpp b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVType.cpp index 9c3cf9634c..f172848ebf 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVType.cpp +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVType.cpp @@ -298,6 +298,10 @@ bool SPIRVType::isTypeCooperativeMatrixKHR() const { return OpCode == OpTypeCooperativeMatrixKHR; } +void SPIRVTypeFloat::decode(std::istream &I) { + getDecoder(I) >> (Id) >> (BitWidth); +} + bool SPIRVType::isTypeVectorBool() const { return isTypeVector() && getVectorComponentType()->isTypeBool(); } diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVType.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVType.h index 13f209037a..560395a3d2 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVType.h +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVType.h @@ -179,9 +179,12 @@ class SPIRVTypeInt : public SPIRVType { class SPIRVTypeFloat : public SPIRVType { public: static const Op OC = OpTypeFloat; + static const SPIRVWord FixedWC = 3; // Complete constructor - SPIRVTypeFloat(SPIRVModule *M, SPIRVId TheId, unsigned TheBitWidth) - : SPIRVType(M, 3, OC, TheId), BitWidth(TheBitWidth) {} + 
SPIRVTypeFloat(SPIRVModule *M, SPIRVId TheId, unsigned TheBitWidth, unsigned TheEncoding) + : SPIRVType(M, 3, OC, TheId), BitWidth(TheBitWidth), Encoding(TheEncoding) { + (void(Encoding)); // Unused + } // Incomplete constructor SPIRVTypeFloat() : SPIRVType(OC), BitWidth(0) {} @@ -191,18 +194,20 @@ class SPIRVTypeFloat : public SPIRVType { SPIRVCapVec CV; if (isTypeFloat(64)) CV.push_back(CapabilityFloat64); + return CV; } protected: - _SPIRV_DEF_DECODE2(Id, BitWidth) + _SPIRV_DCL_DECODE void validate() const override { SPIRVEntry::validate(); assert(BitWidth >= 16 && BitWidth <= 64 && "Invalid bit width"); } private: - unsigned BitWidth; // Bit width + unsigned BitWidth; // Bit width + SPIRVWord Encoding; // FP encoding }; class SPIRVTypePointer : public SPIRVType { diff --git a/llpc/util/llpcShaderModuleHelper.cpp b/llpc/util/llpcShaderModuleHelper.cpp index 4ac381ba2a..c9acab887b 100644 --- a/llpc/util/llpcShaderModuleHelper.cpp +++ b/llpc/util/llpcShaderModuleHelper.cpp @@ -150,6 +150,7 @@ ShaderModuleUsage ShaderModuleHelper::getShaderModuleUsageInfo(const BinaryData shaderModuleUsage.useFragCoord = true; break; } + case BuiltInViewportIndex: case BuiltInPointCoord: case BuiltInLayer: case BuiltInClipDistance: diff --git a/llvmraytracing/include/lgc/GpurtDialect.td b/llvmraytracing/include/lgc/GpurtDialect.td index 89b966ea5c..a632dd16dc 100644 --- a/llvmraytracing/include/lgc/GpurtDialect.td +++ b/llvmraytracing/include/lgc/GpurtDialect.td @@ -360,6 +360,12 @@ def GpurtGetRayStaticIdOp : GpurtOp<"get.ray.static.id", [Memory<[(read Inaccess let summary = "get current ray static ID"; } +def GpurtInitStaticIdOp : GpurtOp<"init.static.id", [Memory<[]>, WillReturn]> { + let arguments = (ins); + let results = (outs I32:$result); + let summary = "Initialize (generate) a ray static ID"; +} + def GpurtContinuationStackIsGlobalOp : GpurtOp<"continuation.stack.is.global", [Memory<[]>, WillReturn]> { let arguments = (ins); let results = (outs I1:$result); diff --git 
a/llvmraytracing/include/lgc/LgcCpsDialect.td b/llvmraytracing/include/lgc/LgcCpsDialect.td index 2e796dc9fe..e7d7654e29 100644 --- a/llvmraytracing/include/lgc/LgcCpsDialect.td +++ b/llvmraytracing/include/lgc/LgcCpsDialect.td @@ -43,7 +43,7 @@ def ContinuationReference : TgConstant<(or I32, I64)>, Type; // ===================================================================================================================== def JumpOp : LgcCpsOp<"jump", [NoReturn]> { - let arguments = (ins ContinuationReference:$target, AttrI32:$levels, value:$state, varargs:$tail); + let arguments = (ins ContinuationReference:$target, AttrI32:$levels, value:$state, ContinuationReference:$rcr, varargs:$tail); let results = (outs); let summary = "Jump to a CPS function."; @@ -52,6 +52,7 @@ def JumpOp : LgcCpsOp<"jump", [NoReturn]> { * target, the continuation reference * levels, a bitmask of levels in which target may run * state, which is pushed to the continuation stack before jumping, + * rcr, a continuation reference the called function can potentially return to * an arbitrary set of arguments appended to the tail of the argument list. }]; } @@ -72,6 +73,18 @@ def AwaitOp : LgcCpsOp<"await", [NoUnwind, WillReturn]> { }]; } +def CompleteOp : LgcCpsOp<"complete", [NoReturn]> { + let arguments = (ins); + let results = (outs); + + let summary = + "represents lane termination for a shader"; + + let description = [{ + Describes the lane termination for a shader (e. g. to end RGS). 
+ }]; +} + // ===================================================================================================================== def AsContinuationReferenceOp : LgcCpsOp<"as.continuation.reference", [NoUnwind, WillReturn]> { let arguments = (ins PointerType:$fn); diff --git a/llvmraytracing/include/lgc/LgcIlCpsDialect.td b/llvmraytracing/include/lgc/LgcIlCpsDialect.td index 605a901ed9..e5ff5fb77b 100644 --- a/llvmraytracing/include/lgc/LgcIlCpsDialect.td +++ b/llvmraytracing/include/lgc/LgcIlCpsDialect.td @@ -91,21 +91,3 @@ def WaitContinueOp : LgcIlCpsOp<"waitContinue", [NoReturn]> { - tail, a set of arguments like the system data or hit attributes. }]; } - -def ReturnOp : LgcIlCpsOp<"return", [NoReturn]> { - let arguments = (ins value:$returnAddr, varargs:$args); - let results = (outs); - - let summary = - "represents the return from a shader"; - - let description = [{ - Describes the return operation for a continuation shader. - - In non-lgc.cps mode, this is used to jump to the incoming return address - for non-RGS, and optionally passing return values in the varargs list. - - For RGS, this is used to terminate the shader after coroutine passes - by passing an undef (non-lgc.cps mode)/poison (lgc.cps mode) address. - }]; -} diff --git a/llvmraytracing/include/lgc/LgcRtDialect.h b/llvmraytracing/include/lgc/LgcRtDialect.h index 3dc2721540..3a79d26fe7 100644 --- a/llvmraytracing/include/lgc/LgcRtDialect.h +++ b/llvmraytracing/include/lgc/LgcRtDialect.h @@ -103,6 +103,9 @@ void setShaderPaq(llvm::Function *func, llvm::Constant *paq); // that that is the only information we have on the payload. llvm::Constant *getPaqFromSize(llvm::LLVMContext &context, size_t size); +// Get size in bytes from PAQ (payload access qualifier). +size_t getSizeFromPaq(llvm::Constant *paq); + // Get arg size (in bytes) metadata for a ray-tracing callable shader function. 
size_t getShaderArgSize(llvm::Function *func); diff --git a/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h b/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h index 7ff58efa64..e05a875c9d 100644 --- a/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h +++ b/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h @@ -198,8 +198,6 @@ class ContHelper { // Flags set for continuations. // This is exposed to gpurt code via the ContinuationsGetFlags intrinsic. static constexpr const char *MDFlagsName = "continuation.flags"; - // Marks an await as a waiting one with a wait mask. - static constexpr const char *MDIsWaitAwaitName = "continuation.wait.await"; static std::optional extractZExtI32Constant(MDNode *Node) { if (Node) { @@ -399,8 +397,11 @@ class ContHelper { } static void setPayloadTypeMetadata(Instruction *I, Type *T) { - I->setMetadata(ContHelper::MDContPayloadTyName, - MDNode::get(I->getContext(), {ConstantAsMetadata::get(PoisonValue::get(T))})); + I->setMetadata(ContHelper::MDContPayloadTyName, getPayloadTypeMetadata(T)); + } + + static MDNode *getPayloadTypeMetadata(Type *T) { + return MDNode::get(T->getContext(), {ConstantAsMetadata::get(PoisonValue::get(T))}); } static std::optional tryGetWaitMask(const CallInst &CI) { @@ -411,20 +412,13 @@ class ContHelper { CI.setMetadata(MDWaitMaskName, getI32MDConstant(CI.getContext(), WaitMask)); } + // Queries whether an awaited call should wait on a wait mask. + static bool isWaitAwaitCall(const CallInst &CI) { return CI.getMetadata(MDWaitMaskName) != nullptr; } + static void removeWaitMask(CallInst &CI) { CI.setMetadata(MDWaitMaskName, nullptr); } static bool isLgcCpsModule(Module &Mod) { return Mod.getNamedMetadata(MDLgcCpsModuleName) != nullptr; } - // Specifies that an awaited call should wait on a wait mask. 
- static void setIsWaitAwaitCall(CallInst &CI) { - CI.setMetadata(ContHelper::MDIsWaitAwaitName, MDTuple::get(CI.getContext(), {})); - } - - // Queries whether an awaited call should wait on a wait mask. - static bool isWaitAwaitCall(const CallInst &CI) { return CI.getMetadata(MDIsWaitAwaitName) != nullptr; } - - static void removeIsWaitAwaitMetadata(CallInst &CI) { CI.setMetadata(ContHelper::MDIsWaitAwaitName, nullptr); } - /// Returns true if a call to the given function should be rematerialized /// in a shader of the specified kind. /// @@ -454,6 +448,9 @@ class ContHelper { // returns true, enabling new behavior (e.g. for tests). static bool getGpurtVersionFlag(Module &GpurtModule, GpuRtVersionFlag Flag); + // Handles the _AmdComplete intrinsic. + static void handleComplete(Function &Func); + // Handles _AmdGetSetting_* intrinsics. static void handleGetSetting(Function &F, ArrayRef Settings); diff --git a/llvmraytracing/lib/CleanupContinuations.cpp b/llvmraytracing/lib/CleanupContinuations.cpp index 0f604c3310..93e1f8f66b 100644 --- a/llvmraytracing/lib/CleanupContinuations.cpp +++ b/llvmraytracing/lib/CleanupContinuations.cpp @@ -49,7 +49,7 @@ // the compiler backend. // 1. Replace returning handle with lgc.cps.jump() with the right continuation // reference. -// 2. Replace @lgc.ilcps.return with simple `ret`, which means thread +// 2. Replace @lgc.cps.complete with simple `ret`, which means thread // termination. // 3. Edit function signatures, like removing coroutine frame pointer argument, // adding needed ones (state, rcr, returned_values) for resume function. @@ -181,6 +181,8 @@ void CleanupContinuationsPass::updateCpsStack(Function *F, Function *NewFunc, bo CpsStack = Builder->create(Builder->getInt32(CpsInfo.ContStateBytes)); CpsStack->setName("cont.state.stack.segment"); } else { + // We don't expect stack size metadata on resume functions. 
+ ContHelper::StackSize::reset(NewFunc); CpsStack = Builder->create(Builder->getInt32(CpsInfo.ContStateBytes)); } @@ -276,10 +278,7 @@ void CleanupContinuationsPass::removeContFreeCall(Function *F, Function *ContFre } } -/// Insert cps.free() before the original function exits. -/// Note: we skip the cps.free() insertion before calls to -/// @lgc.ilcps.return. Because this is not useful any more as it means the -/// thread termination. +/// Insert cps.free() before the original function exits and lgc.cps.complete calls. void CleanupContinuationsPass::freeCpsStack(Function *F, ContinuationData &CpsInfo) { struct VisitState { ContinuationData &CpsInfo; @@ -290,9 +289,9 @@ void CleanupContinuationsPass::freeCpsStack(Function *F, ContinuationData &CpsIn static const auto Visitor = llvm_dialects::VisitorBuilder() .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) - .add([](auto &State, auto &Jump) { - if (Jump.getFunction() == State.F && State.CpsInfo.ContStateBytes) { - State.Builder->SetInsertPoint(&Jump); + .addSet([](auto &State, auto &Instruction) { + if (Instruction.getFunction() == State.F && State.CpsInfo.ContStateBytes) { + State.Builder->SetInsertPoint(&Instruction); State.Builder->template create(State.Builder->getInt32(State.CpsInfo.ContStateBytes)); } }) @@ -364,16 +363,15 @@ void CleanupContinuationsPass::processContinuations() { if (isa(I)) { handleContinue(FuncData.second, I); } else if (I->getOpcode() == Instruction::Unreachable) { - // We should only possibly have 'lgc.ilcps.return' or - // 'lgc.cps.jump' call before unreachable. + // We should only have 'lgc.cps.complete' or 'lgc.cps.jump' calls before unreachable. 
auto *Call = cast(--I->getIterator()); - if (isa(Call)) { + if (isa(Call)) { Builder->SetInsertPoint(Call); Builder->CreateRetVoid(); Call->eraseFromParent(); I->eraseFromParent(); } else { - assert(isa(*Call)); + assert(isa(Call)); } } } @@ -445,31 +443,27 @@ void CleanupContinuationsPass::handleSingleContinue(ContinuationData &Data, Call SmallVector TailArgs; uint32_t SkipCount = 2; + Value *ResumeAddr = nullptr; + const bool IsWait = ContHelper::isWaitAwaitCall(*Call); // WaitMask and %rcr (aka. return continuation reference) for the callee. if (cps::isCpsFunction(*cast(ResumeFun))) { - // Ensure the first argument stays the wait mask. This comes after the CR - // and the levels. - if (ContHelper::isWaitAwaitCall(*Call)) { - TailArgs.push_back(Call->getArgOperand(2)); - ++SkipCount; - } - - auto *ResumeCR = Builder->create(ContinuationReferenceType, ResumeFun); - - TailArgs.push_back(ResumeCR); + ResumeAddr = Builder->create(ContinuationReferenceType, ResumeFun); + if (IsWait) + SkipCount = 3; } else { // For entry-point compute kernel, pass a poison %rcr. - TailArgs.push_back(PoisonValue::get(Builder->getInt32Ty())); + ResumeAddr = PoisonValue::get(Builder->getInt32Ty()); } // Skip continuation.reference, levels and potentially the wait mask. TailArgs.append(SmallVector(drop_begin(Call->args(), SkipCount))); auto *CR = Call->getArgOperand(0); - Value *Level = Call->getArgOperand(ContHelper::isWaitAwaitCall(*Call) ? 2 : 1); + + Value *Level = Call->getArgOperand(IsWait ? 2 : 1); unsigned LevelImm = cast(Level)->getZExtValue(); - // TODO: Continuation state are passed through stack for now. + // TODO: Continuation state is passed through stack for now. auto *State = PoisonValue::get(StructType::get(Builder->getContext(), {})); - auto *JumpCall = Builder->create(CR, LevelImm, State, TailArgs); + auto *JumpCall = Builder->create(CR, LevelImm, State, ResumeAddr, TailArgs); // Replace this instruction with a call to cps.jump. 
JumpCall->copyMetadata(*Call); @@ -537,11 +531,8 @@ void CleanupContinuationsPass::lowerGetResumePoint(Module &Mod) { // call. auto JumpCall = findDominatedContinueCall(GetResumeCall); assert(JumpCall && "Should find a dominated call to lgc.cps.jump"); - // For wait calls, skip the wait mask. - uint32_t SkipCount = ContHelper::isWaitAwaitCall(*(JumpCall.value())) ? 1 : 0; - lgc::cps::JumpOp *Jump = cast(*JumpCall); - Value *ResumeFn = *(Jump->getTail().begin() + SkipCount); + Value *ResumeFn = Jump->getRcr(); assert(ResumeFn && isa(ResumeFn)); // We can always move this as.continuation.reference call. cast(ResumeFn)->moveBefore(GetResumeCall); diff --git a/llvmraytracing/lib/Continuations.cpp b/llvmraytracing/lib/Continuations.cpp index cc152e878d..29c616181d 100644 --- a/llvmraytracing/lib/Continuations.cpp +++ b/llvmraytracing/lib/Continuations.cpp @@ -31,6 +31,7 @@ #include "llvmraytracing/Continuations.h" #include "compilerutils/CompilerUtils.h" +#include "compilerutils/DxilToLlvm.h" #include "llvmraytracing/ContinuationsUtil.h" #include "llvmraytracing/GpurtContext.h" #include "lgc/LgcCpsDialect.h" @@ -547,6 +548,11 @@ void ContHelper::addDxilContinuationPasses(ModulePassManager &MPM, Module *Gpurt MPM.addPass(DXILContPreHookPass()); + // Fixup DXIL vs LLVM incompatibilities. This needs to run first. + // If we add more LLVM processing separate from continuation passes, + // we potentially should do it earlier as part of the module loading. 
+ MPM.addPass(CompilerUtils::DxilToLlvmPass()); + // Translate dx.op intrinsic calls to lgc.rt dialect intrinsic calls MPM.addPass(DXILContLgcRtOpConverterPass()); @@ -564,6 +570,8 @@ void ContHelper::addDxilContinuationPasses(ModulePassManager &MPM, Module *Gpurt } void ContHelper::addDxilGpurtLibraryPasses(ModulePassManager &MPM) { + MPM.addPass(CompilerUtils::DxilToLlvmPass()); + MPM.addPass(llvm::DXILContIntrinsicPreparePass()); MPM.addPass(AlwaysInlinerPass(/*InsertLifetimeIntrinsics=*/false)); @@ -841,47 +849,39 @@ static bool replaceEnqueueIntrinsic(Function &F) { bool IsWaitEnqueue = FuncName.contains("WaitEnqueue"); llvm_dialects::Builder B{F.getContext()}; - auto CreateContinue = [&B](const CallInst &CInst, SmallVectorImpl &TailArgs, - std::optional ReturnAddr) -> CallInst * { - Value *ShaderAddr = CInst.getArgOperand(0); - TailArgs.append(CInst.arg_begin() + 2, CInst.arg_end()); - return B.create(ShaderAddr, PoisonValue::get(B.getInt32Ty()), - ReturnAddr.value_or(CInst.getArgOperand(1)), TailArgs); - }; - - auto CreateWaitContinue = [&B](const CallInst &CInst, SmallVectorImpl &TailArgs, - std::optional ReturnAddr) -> CallInst * { - Value *ShaderAddr = CInst.getArgOperand(0); - TailArgs.append(CInst.arg_begin() + 3, CInst.arg_end()); - Value *WaitMask = CInst.getArgOperand(1); - return B.create(ShaderAddr, WaitMask, PoisonValue::get(B.getInt32Ty()), - ReturnAddr.value_or(CInst.getArgOperand(2)), TailArgs); - }; - llvm::forEachCall(F, [&](CallInst &CInst) { B.SetInsertPoint(&CInst); - SmallVector TailArgs; CallInst *NewCall = nullptr; + Value *WaitMask = nullptr; + Value *RetAddr = nullptr; if (IsEnqueueCall) { // Add the current function as return address to the call. // Used when Traversal calls AnyHit or Intersection. - auto *RetAddr = B.create(B.getInt64Ty(), CInst.getFunction()); - if (IsWaitEnqueue) { - // Handle WaitEnqueueCall. - NewCall = CreateWaitContinue(CInst, TailArgs, RetAddr); - } else { - // Handle EnqueueCall. 
- NewCall = CreateContinue(CInst, TailArgs, RetAddr); - } + RetAddr = B.create(B.getInt64Ty(), CInst.getFunction()); + // Handle WaitEnqueueCall. + if (IsWaitEnqueue) + WaitMask = CInst.getArgOperand(1); } else if (IsWaitEnqueue) { // Handle WaitEnqueue. - NewCall = CreateWaitContinue(CInst, TailArgs, std::nullopt); + WaitMask = CInst.getArgOperand(1); + RetAddr = CInst.getArgOperand(2); } else { - // Handle Enqueue. - NewCall = CreateContinue(CInst, TailArgs, std::nullopt); + RetAddr = CInst.getArgOperand(1); } + SmallVector TailArgs; + TailArgs.append(CInst.arg_begin() + (WaitMask ? 3 : 2), CInst.arg_end()); + + // For DX, these arguments are unused right now and are just here to fulfill the `JumpOp`s requirements as being + // defined in the LgcCpsDialect. + const uint32_t DummyLevelsArg = -1; + Value *DummyContState = PoisonValue::get(StructType::get(B.getContext())); + NewCall = B.create(CInst.getArgOperand(0), DummyLevelsArg, DummyContState, RetAddr, TailArgs); + + if (WaitMask) + ContHelper::setWaitMask(*NewCall, cast(WaitMask)->getSExtValue()); + // NOTE: Inlining ExitRayGen in LowerRaytracingPipeline can cause continue // ops whose name is suffixed .cloned.*, which don't get picked up by the // direct name comparison we use when checking for existence of payload @@ -947,6 +947,14 @@ static void handleGetUninitialized(Function &Func) { }); } +void ContHelper::handleComplete(Function &Func) { + llvm::forEachCall(Func, [&](llvm::CallInst &CInst) { + llvm_dialects::Builder B{&CInst}; + B.create(); + CInst.eraseFromParent(); + }); +} + void ContHelper::handleGetSetting(Function &F, ArrayRef Settings) { auto *Ty = dyn_cast(F.getReturnType()); if (!Ty) @@ -1079,7 +1087,7 @@ void llvm::terminateShader(IRBuilder<> &Builder, CallInst *CompleteCall) { assert(OldTerminator != CompleteCall && "terminateShader: Invalid terminator instruction provided!"); // If there is some code after the call to _AmdComplete or the intended - // lgc.ilcps.return that aborts the shader, 
do the following: + // lgc.cps.return that aborts the shader, do the following: // - Split everything after the completion call into a separate block // - Remove the newly inserted unconditional branch to the split block // - Remove the complete call. @@ -1132,6 +1140,9 @@ bool llvm::earlyDriverTransform(Module &M) { } else if (Name.starts_with("_AmdGetSetting")) { Changed = true; ContHelper::handleGetSetting(F, GpurtSettings); + } else if (Name.starts_with("_AmdComplete")) { + Changed = true; + ContHelper::handleComplete(F); } } diff --git a/llvmraytracing/lib/ContinuationsLint.cpp b/llvmraytracing/lib/ContinuationsLint.cpp index 4902ff7d91..68bdbc1ad1 100644 --- a/llvmraytracing/lib/ContinuationsLint.cpp +++ b/llvmraytracing/lib/ContinuationsLint.cpp @@ -31,7 +31,6 @@ #include "llvmraytracing/Continuations.h" #include "lgc/LgcCpsDialect.h" -#include "lgc/LgcIlCpsDialect.h" #include "llvm-dialects/Dialect/Visitor.h" #include "llvm/IR/Analysis.h" #include "llvm/IR/PassManager.h" @@ -62,7 +61,7 @@ class ContinuationsLintPassImpl final { private: Module &Mod; - using JumpVecTy = SmallVector; + using JumpVecTy = SmallVector; JumpVecTy AllJumps; void collectJumps(); void checkJumpTargets(); @@ -115,10 +114,10 @@ void ContinuationsLintPassImpl::run() { } void ContinuationsLintPassImpl::collectJumps() { - static const auto Visitor = llvm_dialects::VisitorBuilder() - .addSet( - [](JumpVecTy &Jumps, Instruction &Op) { Jumps.push_back(cast(&Op)); }) - .build(); + static const auto Visitor = + llvm_dialects::VisitorBuilder() + .add([](JumpVecTy &Jumps, lgc::cps::JumpOp &Op) { Jumps.push_back(&Op); }) + .build(); Visitor.visit(AllJumps, Mod); } @@ -126,13 +125,7 @@ void ContinuationsLintPassImpl::collectJumps() { // Check that every possible jump candidate has a valid jump target void ContinuationsLintPassImpl::checkJumpTargets() { for (auto *JumpCandidate : AllJumps) { - Value *JumpTarget = nullptr; - if (auto *Continue = dyn_cast(JumpCandidate)) - JumpTarget = 
Continue->getShaderAddr(); - else if (auto *WaitContinue = dyn_cast(JumpCandidate)) - JumpTarget = WaitContinue->getShaderAddr(); - else if (auto *Jump = dyn_cast(JumpCandidate)) - JumpTarget = Jump->getTarget(); + Value *JumpTarget = JumpCandidate->getTarget(); assert(JumpTarget); diff --git a/llvmraytracing/lib/ContinuationsStatsReport.cpp b/llvmraytracing/lib/ContinuationsStatsReport.cpp index 1751a7259d..4c4844c0ce 100644 --- a/llvmraytracing/lib/ContinuationsStatsReport.cpp +++ b/llvmraytracing/lib/ContinuationsStatsReport.cpp @@ -44,6 +44,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/IR/Analysis.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/CommandLine.h" #include using namespace llvm; @@ -51,12 +52,20 @@ using namespace lgc::rt; #define DEBUG_TYPE "continuations-stats-report" +enum class PayloadRegisterSizeReportingMode : uint8_t { Disabled = 0, MaxOutgoing, ByJump }; + static cl::opt ReportContStateSizes("report-cont-state-sizes", cl::desc("Report continuation state sizes for entry functions."), cl::init(false)); -static cl::opt ReportPayloadRegisterSizes("report-payload-register-sizes", - cl::desc("Report payload VGPR sizes for functions."), cl::init(false)); +static cl::opt ReportPayloadRegisterSizes( + "report-payload-register-sizes", cl::init(PayloadRegisterSizeReportingMode::Disabled), + cl::desc("Report payload VGPR sizes for functions."), + cl::values(clEnumValN(PayloadRegisterSizeReportingMode::Disabled, "disabled", "Disable payload size reporting"), + clEnumValN(PayloadRegisterSizeReportingMode::MaxOutgoing, "max", + "Report incoming and maximum outgoing payload sizes"), + clEnumValN(PayloadRegisterSizeReportingMode::ByJump, "byjump", + "Reporting incoming register sizes and payload size for each jump"))); static cl::opt ReportSystemDataSizes("report-system-data-sizes", cl::desc("Report incoming system data sizes for functions."), @@ -92,12 +101,13 @@ ContinuationsStatsReportPassImpl::ContinuationsStatsReportPassImpl(Module 
&Mod) } void ContinuationsStatsReportPassImpl::run() { - if (!ReportPayloadRegisterSizes && !ReportSystemDataSizes && !ReportContStateSizes && !ReportAllSizes) + if (ReportPayloadRegisterSizes == PayloadRegisterSizeReportingMode::Disabled && !ReportSystemDataSizes && + !ReportContStateSizes && !ReportAllSizes) return; collectProcessableFunctions(); - if (ReportAllSizes || ReportPayloadRegisterSizes) + if (ReportAllSizes || ReportPayloadRegisterSizes != PayloadRegisterSizeReportingMode::Disabled) reportPayloadRegisterSizes(); if (ReportAllSizes || ReportSystemDataSizes) @@ -137,7 +147,8 @@ void ContinuationsStatsReportPassImpl::collectProcessableFunctions() { case RayTracingShaderStage::AnyHit: case RayTracingShaderStage::ClosestHit: case RayTracingShaderStage::Miss: - case RayTracingShaderStage::Callable: { + case RayTracingShaderStage::Callable: + case RayTracingShaderStage::Traversal: { FunctionData Data; Data.Stage = Stage; Data.SystemDataTy = F.getFunctionType()->getParamType(SystemDataArgumentIndex); @@ -165,41 +176,78 @@ void ContinuationsStatsReportPassImpl::reportContStateSizes() { } void ContinuationsStatsReportPassImpl::reportPayloadRegisterSizes() { - static const auto Visitor = llvm_dialects::VisitorBuilder>() - .addSet( - [](auto &FuncOutgoingRegCountMap, auto &CInst) { - auto RegCount = ContHelper::OutgoingRegisterCount::tryGetValue(&CInst).value(); - FuncOutgoingRegCountMap[CInst.getFunction()] = - std::max(FuncOutgoingRegCountMap[CInst.getFunction()], RegCount); - }) + using FuncJumpMapTy = DenseMap>>; + + static const auto Visitor = llvm_dialects::VisitorBuilder() + .add([](FuncJumpMapTy &ByJumpRegisterCounts, auto &CInst) { + auto RegCount = ContHelper::OutgoingRegisterCount::tryGetValue(&CInst).value(); + ByJumpRegisterCounts[CInst.getFunction()].push_back({&CInst, RegCount}); + }) .build(); - DenseMap MaxOutgoingRegisterCounts; - Visitor.visit(MaxOutgoingRegisterCounts, Mod); + FuncJumpMapTy ByJumpRegisterCounts; + 
Visitor.visit(ByJumpRegisterCounts, Mod); + + DenseMap MaxOutgoingRegisterCounts; + if (ReportPayloadRegisterSizes == PayloadRegisterSizeReportingMode::MaxOutgoing) { + // Accumulate all outgoing payload sizes per function. + for (auto &[Func, Jumps] : ByJumpRegisterCounts) { + for (auto &[Jump, RegCount] : Jumps) { + MaxOutgoingRegisterCounts[Func] = std::max(MaxOutgoingRegisterCounts[Func], RegCount); + } + } + } + + const StringRef SizeSuffix = " dwords"; + const auto ReportIncomingPayload = [&](Function &Func, std::optional OptIncomingPayloadRegisterCount, + DXILShaderKind ShaderKind, StringRef ReportSuffix, bool AppendSizeSuffix) { + dbgs() << ReportSuffix << " \"" << Func.getName() << "\" (" << ShaderKind << "): "; + if (OptIncomingPayloadRegisterCount.has_value()) { + dbgs() << OptIncomingPayloadRegisterCount.value(); + if (AppendSizeSuffix) + dbgs() << SizeSuffix; + } else { + dbgs() << "(no incoming payload)"; + } + }; for (auto &[Func, FuncData] : ToProcess) { DXILShaderKind ShaderKind = ShaderStageHelper::rtShaderStageToDxilShaderKind(FuncData.Stage.value()); auto OptIncomingPayloadRegisterCount = ContHelper::IncomingRegisterCount::tryGetValue(Func); bool HasIncomingPayload = OptIncomingPayloadRegisterCount.has_value(); - auto It = MaxOutgoingRegisterCounts.find(Func); - bool HasOutgoingPayload = (It != MaxOutgoingRegisterCounts.end()); - if (!HasIncomingPayload && !HasOutgoingPayload) - continue; + if (ReportPayloadRegisterSizes == PayloadRegisterSizeReportingMode::ByJump) { + auto It = ByJumpRegisterCounts.find(Func); + bool HasOutgoingPayload = (It != ByJumpRegisterCounts.end()); - dbgs() << "Incoming and max outgoing payload VGPR size of \"" << Func->getName() << "\" (" << ShaderKind << "): "; - if (HasIncomingPayload) { - dbgs() << OptIncomingPayloadRegisterCount.value() * RegisterBytes; - } else { - dbgs() << "(no incoming payload)"; - } - dbgs() << " and "; - if (HasOutgoingPayload) { - dbgs() << It->second * RegisterBytes; + if 
(!HasIncomingPayload && !HasOutgoingPayload) + continue; + + ReportIncomingPayload(*Func, OptIncomingPayloadRegisterCount, ShaderKind, "Incoming payload VGPR size of", true); + dbgs() << "\n"; + + if (HasOutgoingPayload) { + dbgs() << "Outgoing payload VGPR size by jump:\n"; + for (auto &[Jump, RegCount] : It->second) + dbgs() << *Jump << ": " << RegCount << SizeSuffix << '\n'; + } } else { - dbgs() << "(no outgoing payload)"; + auto It = MaxOutgoingRegisterCounts.find(Func); + bool HasOutgoingPayload = (It != MaxOutgoingRegisterCounts.end()); + + if (!HasIncomingPayload && !HasOutgoingPayload) + continue; + + ReportIncomingPayload(*Func, OptIncomingPayloadRegisterCount, ShaderKind, + "Incoming and max outgoing payload VGPR size of", false); + dbgs() << " and "; + if (HasOutgoingPayload) { + dbgs() << It->second; + } else { + dbgs() << "(no outgoing payload)"; + } + dbgs() << SizeSuffix << '\n'; } - dbgs() << " bytes\n"; } } diff --git a/llvmraytracing/lib/DXILContIntrinsicPrepare.cpp b/llvmraytracing/lib/DXILContIntrinsicPrepare.cpp index 810cfe4197..85031c1e6c 100644 --- a/llvmraytracing/lib/DXILContIntrinsicPrepare.cpp +++ b/llvmraytracing/lib/DXILContIntrinsicPrepare.cpp @@ -174,6 +174,15 @@ static bool isUtilFunction(StringRef Name) { return false; } +static void handleIsLlpc(Function &Func) { + assert(Func.arg_empty() + // bool + && Func.getFunctionType()->getReturnType()->isIntegerTy(1)); + + auto *FalseConst = ConstantInt::getFalse(Func.getContext()); + llvm::replaceCallsToFunction(Func, *FalseConst); +} + llvm::PreservedAnalyses DXILContIntrinsicPreparePass::run(llvm::Module &M, llvm::ModuleAnalysisManager &AnalysisManager) { LLVM_DEBUG(dbgs() << "Run the dxil-cont-intrinsic-prepare pass\n"); @@ -191,8 +200,13 @@ llvm::PreservedAnalyses DXILContIntrinsicPreparePass::run(llvm::Module &M, ShouldTransform = true; else if (isUtilFunction(Name)) ShouldTransform = true; - } else if (Name.contains("_Amd") && isUtilFunction(Name)) { - ShouldTransform = true; + } 
else if (Name.contains("_Amd")) { + if (isUtilFunction(Name)) { + ShouldTransform = true; + } else if (Name.contains("IsLlpc")) { + ShouldTransform = false; + handleIsLlpc(*F); + } } if (ShouldTransform) diff --git a/llvmraytracing/lib/DXILContPostProcess.cpp b/llvmraytracing/lib/DXILContPostProcess.cpp index 279dcda750..84bf959c42 100644 --- a/llvmraytracing/lib/DXILContPostProcess.cpp +++ b/llvmraytracing/lib/DXILContPostProcess.cpp @@ -92,6 +92,7 @@ class DXILContPostProcessPassImpl final { bool handleIntrinsicCalls(llvm::ModuleAnalysisManager &AnalysisManager); bool lowerCpsOps(); void lowerJumpOp(lgc::cps::JumpOp &JumpOp); + void lowerAsContinuationReferenceOp(lgc::cps::AsContinuationReferenceOp &AsCrOp); bool handleAmdInternals(); bool cleanupIncomingPayloadMetadata(Function &F); bool cleanupOutgoingPayloadMetadata(); @@ -186,24 +187,11 @@ void DXILContPostProcessPassImpl::lowerGetResumePointAddr(Function &F) { if (auto *Jump = dyn_cast(ContinueCall)) { ReturnAddrArgNum = 3; - ReturnAddr = *Jump->getTail().begin(); - } else { - if (!isa(ContinueCall)) - report_fatal_error("The BB must end in a continue call after a " - "GetResumePointAddr"); - - if (auto *WaitContinue = dyn_cast(ContinueCall)) { - ReturnAddr = WaitContinue->getReturnAddr(); - ReturnAddrArgNum = 2; - } else { - ReturnAddr = cast(ContinueCall)->getReturnAddr(); - } - - // Move up computation of the resume address - - assert((ReturnAddr->getType() == Builder.getInt64Ty()) && "Unexpected return addr type!"); + ReturnAddr = Jump->getRcr(); } + assert((ReturnAddr->getType() == Builder.getInt64Ty()) && "Unexpected return addr type!"); + SmallVector MoveInstrs; if (auto *I = dyn_cast(ReturnAddr)) { if (!I->comesBefore(CInst)) @@ -504,11 +492,7 @@ bool DXILContPostProcessPassImpl::lowerCpsOps() { static const auto CpsVisitor = llvm_dialects::VisitorBuilder() .add( [](CpsVisitorState &State, lgc::cps::AsContinuationReferenceOp &AsCrOp) { - State.Builder.SetInsertPoint(&AsCrOp); - auto *AddrWithMD = - 
State.Builder.CreateCall(State.GetAddrAndMD, {AsCrOp.getFn()}); - AsCrOp.replaceAllUsesWith(AddrWithMD); - AsCrOp.eraseFromParent(); + State.Self.lowerAsContinuationReferenceOp(AsCrOp); State.Changed = true; }) .add([](CpsVisitorState &State, lgc::cps::JumpOp &JumpOp) { @@ -566,24 +550,76 @@ void DXILContPostProcessPassImpl::lowerJumpOp(lgc::cps::JumpOp &JumpOp) { Value *RCR = Builder.CreateZExt(JumpOp.getTarget(), Builder.getInt64Ty()); CallInst *ContinueOp = nullptr; - Value *ReturnAddr = *JumpOp.getTail().begin(); - SmallVector TailArgs{JumpOp.getTail().begin() + 1, JumpOp.getTail().end()}; + SmallVector TailArgs{JumpOp.getTail()}; + Value *RetAddr = Builder.CreateZExt(JumpOp.getRcr(), Builder.getInt64Ty()); if (auto WaitMask = ContHelper::tryGetWaitMask(JumpOp)) { - ContinueOp = Builder.create( - RCR, Builder.getInt64(WaitMask.value()), PoisonValue::get(Builder.getInt32Ty()), - Builder.CreateZExt(ReturnAddr, Builder.getInt64Ty()), TailArgs); + ContinueOp = Builder.create(RCR, Builder.getInt64(WaitMask.value()), + PoisonValue::get(Builder.getInt32Ty()), RetAddr, TailArgs); ContHelper::removeWaitMask(JumpOp); } else { - ContinueOp = Builder.create(RCR, PoisonValue::get(Builder.getInt32Ty()), - Builder.CreateZExt(ReturnAddr, Builder.getInt64Ty()), TailArgs); + ContinueOp = Builder.create(RCR, PoisonValue::get(Builder.getInt32Ty()), RetAddr, TailArgs); } ContinueOp->copyMetadata(JumpOp); - ContHelper::removeIsWaitAwaitMetadata(*ContinueOp); JumpOp.eraseFromParent(); } +void DXILContPostProcessPassImpl::lowerAsContinuationReferenceOp(lgc::cps::AsContinuationReferenceOp &AsCrOp) { + Builder.SetInsertPoint(&AsCrOp); + Value *AddrWithMD = Builder.CreateCall(getContinuationGetAddrAndMD(*Mod), {AsCrOp.getFn()}); + + if (AsCrOp.getType()->isIntegerTy(32)) { + // If we are using 32-bit compact VPC, extract metadata and encode it into VPC. 
+ auto Vpc = Builder.CreateTruncOrBitCast(AddrWithMD, Builder.getInt32Ty()); + Vpc = Builder.CreateAnd(Vpc, 0xFFFFFFC0); + + // Encode shader priority. + auto RtStage = + lgc::rt::getLgcRtShaderStage(cast(AsCrOp.getFn())).value_or(lgc::rt::RayTracingShaderStage::Count); + + auto GetPriorityFromRtStage = [](lgc::rt::RayTracingShaderStage RtStage) { + // Shader priorities for continuation scheduling. Higher values mean higher scheduling precedence. + // Reserve priority 0 as invalid value. + enum SchedulingPriority : unsigned { + SchedulingPriorityInvalid = 0, + SchedulingPriorityRgs = 1, + SchedulingPriorityChs = 2, + SchedulingPriorityMiss = 2, + SchedulingPriorityTraversal = 3, + SchedulingPriorityAhs = 4, + SchedulingPriorityIs = 5, + SchedulingPriorityCallable = 6, + SchedulingPriorityMaxValid = 7 + }; + switch (RtStage) { + case lgc::rt::RayTracingShaderStage::RayGeneration: + return SchedulingPriorityRgs; + case lgc::rt::RayTracingShaderStage::ClosestHit: + return SchedulingPriorityChs; + case lgc::rt::RayTracingShaderStage::Miss: + return SchedulingPriorityMiss; + case lgc::rt::RayTracingShaderStage::Traversal: + return SchedulingPriorityTraversal; + case lgc::rt::RayTracingShaderStage::AnyHit: + return SchedulingPriorityAhs; + case lgc::rt::RayTracingShaderStage::Intersection: + return SchedulingPriorityIs; + case lgc::rt::RayTracingShaderStage::Callable: + return SchedulingPriorityCallable; + default: + report_fatal_error("Unknown ray tracing shader stage for resume function"); + } + }; + + Vpc = Builder.CreateOr(Vpc, Builder.getInt32(GetPriorityFromRtStage(RtStage))); + AddrWithMD = Vpc; + } + + AsCrOp.replaceAllUsesWith(AddrWithMD); + AsCrOp.eraseFromParent(); +} + bool DXILContPostProcessPassImpl::handleAmdInternals() { bool Changed = false; @@ -633,9 +669,6 @@ PreservedAnalyses DXILContPostProcessPassImpl::run(ModuleAnalysisManager &Analys if (FuncName.starts_with("_AmdGetResumePointAddr")) { Changed = true; lowerGetResumePointAddr(F); - } else if 
(FuncName.starts_with("_AmdComplete")) { - Changed = true; - llvm::forEachCall(F, [&](llvm::CallInst &CInst) { llvm::terminateShader(Builder, &CInst); }); } } diff --git a/llvmraytracing/lib/LegacyCleanupContinuations.cpp b/llvmraytracing/lib/LegacyCleanupContinuations.cpp index a7ffb57cc7..db36d52c62 100644 --- a/llvmraytracing/lib/LegacyCleanupContinuations.cpp +++ b/llvmraytracing/lib/LegacyCleanupContinuations.cpp @@ -43,6 +43,7 @@ #include "llvm-dialects/Dialect/Builder.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/Support/MathExtras.h" @@ -71,7 +72,6 @@ class LegacyCleanupContinuationsPassImpl { MDNode *MD = nullptr; // The continuation state on the CPS stack Value *NewContState = nullptr; - SmallVector NewReturnContinues; /// Cleaned entry function, used to replace metadata Function *NewStart = nullptr; @@ -88,7 +88,6 @@ class LegacyCleanupContinuationsPassImpl { void handleFunctionEntry(ContinuationData &Data, Function *F, bool IsEntry); void handleContinue(ContinuationData &Data, Instruction *Ret); void handleSingleContinue(ContinuationData &Data, CallInst *Call, Value *ResumeFun); - void handleReturn(ContinuationData &Data, lgc::ilcps::ReturnOp &ContRet); Module &M; LLVMContext &Context; @@ -436,19 +435,64 @@ void LegacyCleanupContinuationsPassImpl::processContinuation(Function *StartFunc ContFrame->replaceAllUsesWith(PoisonValue::get(ContFrame->getType())); } - // Handle the function returns + // Handle the function returns. + // Treat returns and existing jumps separately, since otherwise we could accidentally free. + // returns originate from coro passes, indicating functions ending at suspend points, while + // lgc.cps.complete ends the lane. Leave existing jumps to resume functions as they are. 
+ + // We want to free the continuation stack when we end the original shader with a jump (a), but not at jumps that + // correspond to a suspend point (b). This collects the already existing jumps (a) into the PreExistingJumps vector. + // The jumps that correspond to a suspend point, (b), are introduced when lowering existing return instructions. + // To avoid that we accidentally iterate over these newly introduced jumps, we keep the existing rets (which will be + // translated to (b)) and existing jumps (a) separately. Before this pass, ret instructions mark a suspend point. + // However, after this shader, ret instructions mark the end of the thread. Finally, we have lgc.cps.complete, which + // is used to mark the lane termination, e. g. the end of RGS. These are translated to ret instructions as part of + // this pass. + // Note: Technically, it is not required to free the CPS stack at complete calls, but for consistency reasons, we do + // it anyway. + SmallVector PreExistingRets; + SmallVector PreExistingJumps; for (auto &BB : make_early_inc_range(*NewFunc)) { auto *I = BB.getTerminator(); if (I->getOpcode() == Instruction::Ret) { - handleContinue(FuncData, I); + PreExistingRets.push_back(cast(I)); } else if (I->getOpcode() == Instruction::Unreachable && BB.size() > 1) { - if (auto *Call = dyn_cast(--I->getIterator())) { - if (auto *ContRet = dyn_cast(Call)) - handleReturn(FuncData, *ContRet); + CallInst *PrevInst = cast(&*(--I->getIterator())); + if (auto *Jump = dyn_cast(PrevInst)) { + PreExistingJumps.push_back(Jump); + continue; + } + + // Transform a lane-terminating lgc.cps.complete into a ret instruction. + // If this a non-terminating lgc.cps.jump, this will just free the stack. 
+ if (isa(PrevInst)) { + B.SetInsertPoint(PrevInst); + + uint32_t NeededStackSize = FuncData.getContStateStackBytes(); + if (NeededStackSize > 0) + B.create(B.getInt32(NeededStackSize)); + + llvm::terminateShader(B, PrevInst); + } else { + LLVM_DEBUG(PrevInst->dump()); + llvm_unreachable("Unexpected instruction!"); } } } + // First, handle the pre-existing jumps, (a). + for (auto *Jump : PreExistingJumps) { + B.SetInsertPoint(Jump); + + uint32_t NeededStackSize = FuncData.getContStateStackBytes(); + if (NeededStackSize > 0) + B.create(B.getInt32(NeededStackSize)); + } + + // Then, insert the new jumps for pre-existing returns / suspend points, (b). + for (auto *Ret : PreExistingRets) + handleContinue(FuncData, Ret); + for (auto *I : InstsToRemove) I->eraseFromParent(); @@ -557,21 +601,14 @@ void LegacyCleanupContinuationsPassImpl::handleSingleContinue(ContinuationData & auto *ContinuationReference = B.create(I64, ResumeFun); - bool IsWait = ContHelper::isWaitAwaitCall(*Call); - - // The jump call tail argument list needs to start with the return address. Value *JumpAddr = B.CreatePointerCast(Call->getCalledOperand(), I64); - SmallVector TailArgs{Call->arg_begin() + (IsWait ? 
1 : 0), Call->arg_end()}; - TailArgs.insert(TailArgs.begin(), ContinuationReference); + SmallVector TailArgs{Call->args()}; - CallInst *Jump = - B.create(JumpAddr, -1, PoisonValue::get(StructType::get(B.getContext())), TailArgs); + CallInst *Jump = B.create(JumpAddr, -1, PoisonValue::get(StructType::get(B.getContext())), + ContinuationReference, TailArgs); Jump->copyMetadata(*Call); - ContHelper::removeIsWaitAwaitMetadata(*Jump); - if (IsWait) - ContHelper::setWaitMask(*Jump, cast(Call->getArgOperand(0))->getSExtValue()); assert(ContHelper::OutgoingRegisterCount::tryGetValue(Jump) && "Missing registercount metadata!"); // Remove instructions at the end of the block @@ -583,42 +620,6 @@ void LegacyCleanupContinuationsPassImpl::handleSingleContinue(ContinuationData & } } -/// Transform -/// call void (i64, ...) @lgc.ilcps.return(i64 %returnaddr, ) unreachable -/// to -/// -/// call void @lgc.ilcps.continue(i64 %returnaddr, ) -/// unreachable -void LegacyCleanupContinuationsPassImpl::handleReturn(ContinuationData &Data, lgc::ilcps::ReturnOp &ContRet) { - LLVM_DEBUG(dbgs() << "Converting return to continue: " << ContRet << "\n"); - bool IsEntry = isa(ContRet.getReturnAddr()); - B.SetInsertPoint(&ContRet); - - uint32_t NeededStackSize = Data.getContStateStackBytes(); - if (NeededStackSize > 0) - B.create(B.getInt32(NeededStackSize)); - - if (IsEntry) { - assert(ContRet.getArgs().empty() && "Entry functions ignore the return value"); - - llvm::terminateShader(B, &ContRet); - } else { - // Create the call to lgc.ilcps.continue, but with the same argument list - // as for lgc.ilcps.return. The CSP is being set during - // DXILContPostProcess. - // Append the dummy return address as well. 
- SmallVector RetTail{ContRet.getArgs()}; - auto *ContinueOp = B.create(ContRet.getReturnAddr(), PoisonValue::get(B.getInt32Ty()), - PoisonValue::get(B.getInt64Ty()), RetTail); - Data.NewReturnContinues.push_back(ContinueOp); - - ContinueOp->copyMetadata(ContRet); - assert(ContHelper::OutgoingRegisterCount::tryGetValue(ContinueOp) && "Missing registercount metadata!"); - ContRet.eraseFromParent(); - } -} - LegacyCleanupContinuationsPassImpl::LegacyCleanupContinuationsPassImpl(llvm::Module &Mod, llvm::ModuleAnalysisManager &AnalysisManager) : M{Mod}, Context{M.getContext()}, diff --git a/llvmraytracing/lib/LgcCpsJumpInliner.cpp b/llvmraytracing/lib/LgcCpsJumpInliner.cpp index 01dd059f86..e970060385 100644 --- a/llvmraytracing/lib/LgcCpsJumpInliner.cpp +++ b/llvmraytracing/lib/LgcCpsJumpInliner.cpp @@ -100,6 +100,8 @@ PreservedAnalyses LgcCpsJumpInlinerPassImpl::run() { ArgList.push_back(Jump->getState()); } + ArgList.push_back(Jump->getRcr()); + ArgList.append(Jump->getTail().begin(), Jump->getTail().end()); CrossInliner.inlineCall(Builder, JumpTargetFunc, ArgList); diff --git a/llvmraytracing/lib/LgcRtDialect.cpp b/llvmraytracing/lib/LgcRtDialect.cpp index 5495355bd9..377916aa11 100644 --- a/llvmraytracing/lib/LgcRtDialect.cpp +++ b/llvmraytracing/lib/LgcRtDialect.cpp @@ -222,6 +222,13 @@ Constant *lgc::rt::getPaqFromSize(LLVMContext &context, size_t size) { return ConstantArray::get(ArrayType::get(i32Ty, 1), ConstantInt::get(i32Ty, size)); } +// ============================================================================================== +// Get size in bytes from PAQ (payload access qualifier). +// Currently the PAQ is defined as always being a single word giving the size in bytes. 
+size_t lgc::rt::getSizeFromPaq(Constant *paq) { + return cast(paq->getAggregateElement(0U))->getZExtValue(); +} + // ============================================================================================== // Get arg size (in bytes) metadata for a ray-tracing callable shader function. // We don't allow for the metadata not existing -- that would cause an assert in diff --git a/llvmraytracing/lib/LowerAwait.cpp b/llvmraytracing/lib/LowerAwait.cpp index 36ca5f3f5a..dfae9f65f6 100644 --- a/llvmraytracing/lib/LowerAwait.cpp +++ b/llvmraytracing/lib/LowerAwait.cpp @@ -29,7 +29,7 @@ // a resume point. // // This pass introduces a global for the return address, which is saved at the -// start of a function and used in a `@lgc.ilcps.return(i64)` call in the +// start of a function and used in a `@lgc.cps.jump` call in the // end. // //===----------------------------------------------------------------------===// @@ -106,7 +106,6 @@ void LowerAwaitPassImpl::processContinuations(bool IsLgcCpsMode) { auto &Context = Mod.getContext(); auto *I8Ptr = Type::getInt8Ty(Context)->getPointerTo(); auto *I32 = Type::getInt32Ty(Context); - auto *I64 = Type::getInt64Ty(Context); Type *TokenTy = StructType::create(Context, "continuation.token")->getPointerTo(); @@ -125,9 +124,6 @@ void LowerAwaitPassImpl::processContinuations(bool IsLgcCpsMode) { // Lgc.cps dialect will handle stack pointer and return address in // DXILContPostProcessPass. - bool IsTraversal = lgc::rt::getLgcRtShaderStage(F) == lgc::rt::RayTracingShaderStage::Traversal; - bool IsLegacyNonEntry = !ContHelper::isLegacyEntryFunction(F) && !IsLgcCpsMode && !IsTraversal; - for (auto const &Arg : F->args()) AllArgTypes.push_back(Arg.getType()); @@ -223,45 +219,6 @@ void LowerAwaitPassImpl::processContinuations(bool IsLgcCpsMode) { } CI->eraseFromParent(); } - - // Save the return address at the start of the function for legacy path. - // For lgc.cps, we don't need to save any value, so just not passing any - // argument. 
- Value *SavedRetAddr = nullptr; - if (!IsLgcCpsMode) { - if (IsLegacyNonEntry) - SavedRetAddr = NewFunc->getArg(0); // Return addr - else - SavedRetAddr = UndefValue::get(I64); - } else { - // We omit the "return address" later, make sure the - // dialects verifier doesn't fail since we disallow `nullptr` arguments - // right now. - SavedRetAddr = PoisonValue::get(I32); - } - - // Convert returns to lgc.ilcps.return calls - for (auto &BB : *NewFunc) { - auto *I = BB.getTerminator(); - if (I->getOpcode() == Instruction::Ret) { - // Replace this instruction with a call to lgc.ilcps.return - B.SetInsertPoint(I); - SmallVector RetVals; - - if (!IsLgcCpsMode) { - if (I->getNumOperands() != 0) - RetVals.push_back(I->getOperand(0)); - } - - auto *ContRetOp = B.create(SavedRetAddr, RetVals); - // DXILCont passes use annotations on the ret to pass information - // on the shader exit to later passes. Copy such metadata to the ContRet - // so later passes can pick it up from there. - ContRetOp->copyMetadata(*I); - B.CreateUnreachable(); - I->eraseFromParent(); - } - } } } diff --git a/llvmraytracing/lib/LowerRayQuery.cpp b/llvmraytracing/lib/LowerRayQuery.cpp index b472b31a32..9df3b4b6f7 100644 --- a/llvmraytracing/lib/LowerRayQuery.cpp +++ b/llvmraytracing/lib/LowerRayQuery.cpp @@ -541,7 +541,14 @@ void LowerRayQuery::visitHitAccessor(GpurtFunc funcType, Value *rayQuery, bool c CrossModuleInliner inliner; auto call = inliner.inlineCall(*m_builder, gpurtFunc, {rayQuery, committedArg}); - inst->replaceAllUsesWith(call.returnValue); + Value *retVal = call.returnValue; + // If the return value is a struct with a single element whose type matches the dialect op's return + // value, then extract the value. 
+ if (auto *structTy = dyn_cast(retVal->getType())) { + if (structTy->getNumElements() == 1 && structTy->getElementType(0) == inst->getType()) + retVal = m_builder->CreateExtractValue(retVal, 0); + } + inst->replaceAllUsesWith(retVal); m_typeLowering->eraseInstruction(inst); m_funcsToLower.insert(inst->getCalledFunction()); } diff --git a/llvmraytracing/lib/LowerRaytracingPipeline.cpp b/llvmraytracing/lib/LowerRaytracingPipeline.cpp index 655f558fc4..f5d0cded8c 100644 --- a/llvmraytracing/lib/LowerRaytracingPipeline.cpp +++ b/llvmraytracing/lib/LowerRaytracingPipeline.cpp @@ -337,68 +337,36 @@ class LowerRaytracingPipelinePassImpl final { // function signatures and continue / jump calls. class PayloadHelper final { public: - PayloadHelper(Module &Mod, const DataLayout &DL, llvm_dialects::Builder &Builder, bool CpsMode) - : Mod{Mod}, DL{DL}, Builder{Builder}, IsCpsMode{CpsMode} {} + PayloadHelper(Module &Mod, const DataLayout &DL, llvm_dialects::Builder &Builder) + : Mod{Mod}, DL{DL}, Builder{Builder} {} /// Append padding and payload to lgc.cps.jump calls. - void patchJumpCalls(Function *Parent, ArrayRef JumpCalls, std::optional PayloadStartDword) { - if (!IsCpsMode || !PayloadStartDword.has_value()) + void patchJumpCalls(Function *Parent, ArrayRef JumpCalls, std::optional PayloadStartDword, + std::optional NumPreservedPayloadDwords, + Value *PayloadSerializationStorage = nullptr) { + if (!PayloadStartDword.has_value()) return; + assert(NumPreservedPayloadDwords.has_value() && + "PayloadHelper::patchJumpCalls: Expected the number of preserved payload dwords to be set!"); + const uint32_t PayloadSize = NumPreservedPayloadDwords.value(); + for (auto *Jump : JumpCalls) { Builder.SetInsertPoint(Jump); - SmallVector NewTailArgs(Jump->getTail()); + + SmallVector NewTailArgs{Jump->getTail()}; // Add padding so that payload starts at a fixed dword. 
ContHelper::addPaddingValue(DL, Parent->getContext(), NewTailArgs, PayloadStartDword.value()); // Insert payload into tail args. - NewTailArgs.push_back(Parent->getArg(CpsArgIdxPayload)); + NewTailArgs.push_back( + Builder.CreateLoad(ArrayType::get(Builder.getInt32Ty(), PayloadSize), PayloadSerializationStorage)); - Jump->replaceTail(NewTailArgs); + auto *NewJump = Jump->replaceTail(NewTailArgs); + ContHelper::OutgoingRegisterCount::setValue(NewJump, PayloadSize); } } - /// Find a continue call starting from the terminator of a given basic - /// block. - /// Returns a pair containing a pointer to the call, and the iterator range - /// containing the tail argument list used, for computing the padding at the - /// callsite. - std::pair> - getContinueCallFromTerminator(Instruction *Terminator) { - assert((isa(Terminator))); - auto RIt = Terminator->getReverseIterator(); - - // We technically could have an eligible terminator - // as the single instruction of a BB, so we don't want to assert here. - BasicBlock *BB = Terminator->getParent(); - - // Find a continue call starting from the unreachable. - // Don't single-step because at this point the caller - // has created the payload load before the terminator, - // and re-creating the continue call will fix up the order again. - CallInst *CInst = nullptr; - while (RIt != BB->rend()) { - CInst = dyn_cast(&*RIt); - - if (CInst) - break; - - ++RIt; - } - - assert(CInst); - - if (auto *Continue = dyn_cast(CInst)) - return {Continue, Continue->getTail()}; - - if (auto *WaitContinue = dyn_cast(CInst)) - return {WaitContinue, WaitContinue->getTail()}; - - report_fatal_error("LowerRaytracingPipelinePassImpl::PayloadHelper::" - "getContinueCallFromTerminator: expected either a " - "lgc.ilcps.continue or a lgc.ilcps.waitContinue op!"); - } - /// Create and initialize payload serialization storage for non-Traversal /// shader. 
void initializePayloadSerializationStorage(Function *Parent, FunctionData &Data) { @@ -428,17 +396,16 @@ class LowerRaytracingPipelinePassImpl final { /// Compute the dword at which payload starts in the argument at most in the /// argument list. std::optional getPayloadStartDword(FunctionData &Data, uint32_t MaxHitAttributeBytes, - Type *TraversalDataTy) { + Type *TraversalDataTy, bool CpsMode) { if (Data.PayloadStorageTy->getArrayNumElements() == 0) return std::nullopt; assert(TraversalDataTy && "Failed to detect traversal system data type"); - // For lgc.cps mode, take into account that the return address and shader - // index dwords are inserted at a later stage. + // For lgc.cps mode, take into account the shader index dword is inserted at a later stage. // Always ensure that we consider the two dword barycentric coordinates // passed as argument for _AmdEnqueueAnyHit calls. - return (IsCpsMode ? 1 + 1 : 0) + getArgumentDwordCount(DL, TraversalDataTy) + + return (CpsMode ? 1 : 0) + getArgumentDwordCount(DL, TraversalDataTy) + #if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 503627 // Old version of the code std::max(divideCeil(MaxHitAttributeBytes, RegisterBytes), uint64_t(2)); @@ -513,7 +480,6 @@ class LowerRaytracingPipelinePassImpl final { Module &Mod; const DataLayout &DL; llvm_dialects::Builder &Builder; - bool IsCpsMode = false; }; void replaceCall(FunctionData &Data, CallInst *Call, Function *Func, ContinuationCallType CallType); @@ -593,8 +559,8 @@ class LowerRaytracingPipelinePassImpl final { void collectProcessableFunctions(); - Instruction *insertCpsAwait(Type *ReturnTy, Value *ShaderAddr, Instruction *Call, ArrayRef Args, - ContinuationCallType CallType, RayTracingShaderStage ShaderStage); + CallInst *insertCpsAwait(Type *ReturnTy, Value *ShaderAddr, Instruction *Call, ArrayRef Args, + ContinuationCallType CallType, RayTracingShaderStage ShaderStage); MapVector ToProcess; Module *Mod; @@ -614,6 +580,7 @@ class LowerRaytracingPipelinePassImpl final { 
Type *HitMissDataTy; /// Dispatch system data type passed to RayGen and others Type *DispatchSystemDataTy; + Type *RcrTy; // Function definitions and declarations from HLSL // Driver implementation that returns if AcceptHitAndEndSearch was called @@ -680,9 +647,9 @@ void ModuleMetadataState::updateModuleMetadata() const { } // Create a lgc.cps.await operation for a given shader address. -Instruction *LowerRaytracingPipelinePassImpl::insertCpsAwait(Type *ReturnTy, Value *ShaderAddr, Instruction *Call, - ArrayRef Args, ContinuationCallType CallType, - RayTracingShaderStage ShaderStage) { +CallInst *LowerRaytracingPipelinePassImpl::insertCpsAwait(Type *ReturnTy, Value *ShaderAddr, Instruction *Call, + ArrayRef Args, ContinuationCallType CallType, + RayTracingShaderStage ShaderStage) { Builder.SetInsertPoint(Call); Value *CR = nullptr; @@ -967,20 +934,12 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy bool IsWait = (Call->getCalledFunction()->getName().starts_with("_AmdWaitAwait")); - Value *WaitMask = nullptr; Value *RetAddr = nullptr; if (MetadataState.isInLgcCpsMode()) { // For LgcCps, skip function-addr, the return address will be filled at late // stage of continuation transform. Add shader index so that the callee cps // function get correct shader-index being passed in. - // Append the wait mask to the begin of the tail args. - if (IsWait) { - constexpr static uint32_t WaitMaskIdx = 1; - ArgTys.push_back(FTy->getParamType(WaitMaskIdx)); - Args.push_back(Call->getArgOperand(WaitMaskIdx)); - } - ArgTys.push_back(I32); auto *ShaderIndex = CrossInliner .inlineCall(Builder, GetLocalRootIndex, @@ -997,10 +956,6 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy // padding only on the actual tail arguments, which is the only varying part // of the final continue call at the end. WaitAwaitTraversal calls don't // have a return address, so keep that in mind here. 
- - if (IsWait) - WaitMask = Call->getArgOperand(1); - uint32_t RetAddrArgIndex = IsWait ? 2 : 1; if (CallType == ContinuationCallType::Traversal) { RetAddr = PoisonValue::get(Builder.getInt64Ty()); @@ -1020,7 +975,7 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy Args.push_back(HitAttrs); } - Instruction *Annotatable = nullptr; + CallInst *Annotatable = nullptr; Value *NewCall = nullptr; uint32_t OutgoingPayloadDwords = 0; @@ -1038,23 +993,23 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy const bool HasPayload = Data.FirstPayloadArgumentDword.has_value(); // Add padding so that returned payload starts at a fixed dword. - // NOTE: In lgc.cps mode, subtract 1 as return address is not - // included in the returned argument list. if (HasPayload) { - const uint32_t PaddingOffset = IsLgcCpsMode ? 1 : 0; - const auto &[OutgoingPaddingTy, OutgoingPayloadTy] = PayloadHelper.computePaddingAndPayloadArgTys( - ArgTys, OutgoingPayloadDwords, Data.FirstPayloadArgumentDword, PaddingOffset); + const auto &[OutgoingPaddingTy, OutgoingPayloadTy] = + PayloadHelper.computePaddingAndPayloadArgTys(ArgTys, OutgoingPayloadDwords, Data.FirstPayloadArgumentDword); Args.push_back(PoisonValue::get(OutgoingPaddingTy)); Args.push_back(Builder.CreateLoad(OutgoingPayloadTy, Data.PayloadStorage)); } + Value *WaitMask = nullptr; + if (IsWait) + WaitMask = Call->getArgOperand(1); + if (IsLgcCpsMode) { if (HasPayload) { // Compute padding for the resume function so that payload starts at a - // fixed dword. NOTE: Minus 2 as in lgc.cps mode, return address (i32) and - // shader index (i32) are not included. + // fixed dword. NOTE: Minus 1 as in lgc.cps mode, shader index (i32) is not included. 
PayloadHelper.computePaddingAndPayloadArgTys(ReturnedArgTys, ReturnedRegisterCount.value(), - Data.FirstPayloadArgumentDword, 2); + Data.FirstPayloadArgumentDword, 1); } auto *NewRetTy = StructType::get(Builder.getContext(), ReturnedArgTys); @@ -1073,11 +1028,6 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy ArgTys.insert(ArgTys.begin(), RetAddr->getType()); Args.insert(Args.begin(), RetAddr); - if (WaitMask) { - ArgTys.insert(ArgTys.begin(), WaitMask->getType()); - Args.insert(Args.begin(), WaitMask); - } - auto *ShaderTy = FunctionType::get(TokenTy, ArgTys, false); auto *ShaderFun = Builder.CreateIntToPtr(ShaderAddr, ShaderTy->getPointerTo()); @@ -1094,16 +1044,14 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall(ContinuationCallTy Annotatable = Token; } + if (WaitMask) + ContHelper::setWaitMask(*Annotatable, cast(WaitMask)->getSExtValue()); + // Copy back returned payload to the payload serialization alloca as part of // the payload copying. if (HasPayload) Builder.CreateStore(Builder.CreateExtractValue(NewCall, ReturnedArgTys.size() - 1), Data.PayloadStorage); - // For WaitAwait, add metadata indicating that we wait. After coroutine - // passes, we then generate a waitContinue on the awaited function. - if (IsWait) - ContHelper::setIsWaitAwaitCall(*cast(Annotatable)); - ContHelper::ReturnedRegisterCount::setValue(Annotatable, ReturnedRegisterCount.value()); auto OutgoingRegisterCount = std::min(OutgoingSerializationLayout ? OutgoingSerializationLayout->NumStorageI32s @@ -1226,9 +1174,8 @@ void LowerRaytracingPipelinePassImpl::handleGetCurrentFuncAddr(Function &Func) { llvm::forEachCall(Func, [&](llvm::CallInst &CInst) { auto *F = CInst.getFunction(); - auto *RetTy = MetadataState.isInLgcCpsMode() ? Builder.getInt32Ty() : Builder.getInt64Ty(); Builder.SetInsertPoint(&CInst); - Value *AsContRef = Builder.create(RetTy, F); + Value *AsContRef = Builder.create(RcrTy, F); AsContRef = MetadataState.isInLgcCpsMode() ? 
Builder.CreateZExt(AsContRef, Builder.getInt64Ty()) : AsContRef; CInst.replaceAllUsesWith(AsContRef); CInst.eraseFromParent(); @@ -1477,6 +1424,9 @@ void LowerRaytracingPipelinePassImpl::copyHitAttributes(FunctionData &Data, Valu } void LowerRaytracingPipelinePassImpl::setGpurtEntryRegisterCountMetadata() { + if (MetadataState.isInLgcCpsMode()) + return; + // Even if PreservedPayloadRegisterCount is set, there may be // additional shaders in the current module whose usage is recorded // in MaxUsedPayloadRegisterCount, to take the max with it. @@ -1491,7 +1441,7 @@ void LowerRaytracingPipelinePassImpl::setGpurtEntryRegisterCountMetadata() { static const auto Visitor = llvm_dialects::VisitorBuilder() - .addSet([](VisitorState &State, Instruction &Op) { + .addSet([](VisitorState &State, Instruction &Op) { uint32_t InRegisterCount = 0; uint32_t OutRegisterCount = 0; auto *CallerFunc = Op.getFunction(); @@ -1647,23 +1597,22 @@ void LowerRaytracingPipelinePassImpl::processFunctionEnd(FunctionData &Data, Fun if (ExitRayGen) handleExitRayGen(Data); - Builder.CreateRetVoid(); + Builder.create(); + Builder.CreateUnreachable(); EData.Terminator->eraseFromParent(); return; } - const bool IsTraversal = Data.Kind == RayTracingShaderStage::Traversal; SmallVector PaddingArgs; + SmallVector TailArgList; if (MetadataState.isInLgcCpsMode()) { - // Jump to resume point of caller, pass Poison Rcr and ShaderIndex as they - // are not meaningful for the case. - PaddingArgs.append({PoisonValue::get(I32), PoisonValue::get(I32)}); + // Jump to resume point of caller, pass Poison ShaderIndex as it is not meaningful here. + PaddingArgs.push_back(PoisonValue::get(I32)); } Function *Parent = EData.Terminator->getFunction(); - SmallVector TailArgList; unsigned OutgoingRegisterCount = 0; // For Traversal and Intersection, only pass through the payload registers // after reading them back from the serialization alloca. 
@@ -1676,83 +1625,30 @@ void LowerRaytracingPipelinePassImpl::processFunctionEnd(FunctionData &Data, Fun std::min(EData.OutgoingSerializationLayout->NumStorageI32s, MetadataState.getMaxPayloadRegisterCount()); } - Instruction *Ret = nullptr; - if (MetadataState.isInLgcCpsMode()) { - if (RetValue) - PaddingArgs.push_back(RetValue); - - // Construct the tail argument list and append the padding and payload - // values. - TailArgList.append(PaddingArgs); - PayloadHelper.appendPaddingAndPayloadValues(PaddingArgs, TailArgList, OutgoingRegisterCount, - Data.FirstPayloadArgumentDword, Data.PayloadStorage); - - Ret = Builder.create(Parent->getArg(CpsArgIdxReturnAddr), getPotentialCpsReturnLevels(Data.Kind), - PoisonValue::get(StructType::get(Builder.getContext())), TailArgList); - Builder.CreateUnreachable(); - EData.Terminator->eraseFromParent(); - } else if (IsTraversal) { - // TODO: For Traversal, we already have continue calls from the - // IntrinsicPrepare pass. So, we only want to include padding and payload - // for these existing calls. - auto [ContinueCall, ItRange] = PayloadHelper.getContinueCallFromTerminator(EData.Terminator); - - PaddingArgs.append(ItRange.begin(), ItRange.end()); - TailArgList.append(PaddingArgs); - - PayloadHelper.appendPaddingAndPayloadValues(PaddingArgs, TailArgList, OutgoingRegisterCount, - Data.FirstPayloadArgumentDword, Data.PayloadStorage); - - Builder.SetInsertPoint(EData.Terminator); - - // Create a lgc.cps.jump call with all arguments including the padding and the - // payload. 
- Value *ReturnAddr = nullptr; - Value *WaitMask = nullptr; - if (auto *WaitContinue = dyn_cast(ContinueCall)) { - WaitMask = WaitContinue->getWaitMask(); - ReturnAddr = WaitContinue->getReturnAddr(); - } else if (auto *Continue = dyn_cast(ContinueCall)) { - ReturnAddr = Continue->getReturnAddr(); - } - - assert(ReturnAddr); - - TailArgList.insert(TailArgList.begin(), ReturnAddr); - CallInst *NewCall = Builder.create( - ContinueCall->getArgOperand(0), -1, PoisonValue::get(StructType::get(ContinueCall->getContext())), TailArgList); + Value *ReturnAddr = Parent->getArg(MetadataState.isInLgcCpsMode() ? CpsArgIdxReturnAddr : 0); + const uint32_t Levels = MetadataState.isInLgcCpsMode() ? getPotentialCpsReturnLevels(Data.Kind) : -1; - NewCall->copyMetadata(*ContinueCall); + if (RetValue) + PaddingArgs.push_back(RetValue); - if (WaitMask) - ContHelper::setWaitMask(*NewCall, cast(WaitMask)->getZExtValue()); + // Construct the tail argument list and append the padding and payload + // values. + TailArgList.append(PaddingArgs); + PayloadHelper.appendPaddingAndPayloadValues(PaddingArgs, TailArgList, OutgoingRegisterCount, + Data.FirstPayloadArgumentDword, Data.PayloadStorage); - ContinueCall->eraseFromParent(); - } else { - if (RetValue) - PaddingArgs.push_back(RetValue); - - PayloadHelper.appendPaddingAndPayloadValues(PaddingArgs, TailArgList, OutgoingRegisterCount, - Data.FirstPayloadArgumentDword, Data.PayloadStorage); - - // Include the return value (it was already included in the PaddingArgs - // set itself). 
- if (RetValue) - TailArgList.insert(TailArgList.begin(), RetValue); - Ret = Builder.create(Parent->getArg(0), TailArgList); - Builder.CreateUnreachable(); - - EData.Terminator->eraseFromParent(); - } + Instruction *Jump = + Builder.create(ReturnAddr, Levels, PoisonValue::get(StructType::get(Builder.getContext())), + PoisonValue::get(RcrTy), TailArgList); + Builder.CreateUnreachable(); + EData.Terminator->eraseFromParent(); // Annotate the terminator with number of outgoing payload registers. // This annotation will be passed along the following transformations, // ending up at the final continuation call. - if (Ret) { - ContHelper::OutgoingRegisterCount::setValue(Ret, OutgoingRegisterCount); - if (EData.OutgoingSerializationLayout) - MetadataState.updateMaxUsedPayloadRegisterCount(OutgoingRegisterCount); - } + ContHelper::OutgoingRegisterCount::setValue(Jump, OutgoingRegisterCount); + if (EData.OutgoingSerializationLayout) + MetadataState.updateMaxUsedPayloadRegisterCount(OutgoingRegisterCount); } void LowerRaytracingPipelinePassImpl::handleExitRayGen(const FunctionData &Data) { @@ -1784,6 +1680,7 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData uint32_t SystemDataArgumentIndex = 0; + // We always have a return address argument, which must not be included in the padding computation. if (MetadataState.isInLgcCpsMode()) { // Create the CPS function header. @@ -1800,12 +1697,10 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData AllArgTypes.push_back(StructType::get(Mod->getContext())); AllArgTypes.push_back(Builder.getInt32Ty()); - AllArgTypes.push_back(Builder.getInt32Ty()); SystemDataArgumentIndex = 3; } else { - // For non-lgc.cps mode, we always have a return address argument, which - // must not be included in the padding computation. 
The overall layout is: + // The overall layout is: // | returnAddr | systemData | (hitAttrs, remaining args) | padding | // payload // If we don't pass payload, then for stability reasons, we still pass in a @@ -1888,8 +1783,8 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData assert(NumIncomingPayloadDwords.has_value()); Data.PayloadStorageTy = PayloadHelper.getPayloadStorageTy(MetadataState.getMaxPayloadRegisterCount(), Data); - Data.FirstPayloadArgumentDword = - PayloadHelper.getPayloadStartDword(Data, MetadataState.getMaxHitAttributeByteCount(), TraversalDataTy); + Data.FirstPayloadArgumentDword = PayloadHelper.getPayloadStartDword(Data, MetadataState.getMaxHitAttributeByteCount(), + TraversalDataTy, MetadataState.isInLgcCpsMode()); const bool HasPayloadArgument = Data.Kind != RayTracingShaderStage::RayGeneration; if (HasPayloadArgument) { @@ -1904,8 +1799,11 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData } // Pass in the return address argument - if (!MetadataState.isInLgcCpsMode()) - AllArgTypes.insert(AllArgTypes.begin(), Builder.getInt64Ty()); + { + const uint32_t RetAddrSize = MetadataState.isInLgcCpsMode() ? 32 : 64; + const uint32_t RetAddrPos = MetadataState.isInLgcCpsMode() ? 1 : 0; + AllArgTypes.insert(AllArgTypes.begin() + RetAddrPos, Builder.getIntNTy(RetAddrSize)); + } Data.PayloadSpillSize = computePayloadSpillSize(Data.MaxOutgoingPayloadI32s, MetadataState.getMaxPayloadRegisterCount()); @@ -1950,29 +1848,29 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData Value *NewSystemData = nullptr; const bool IsTraversal = Data.Kind == RayTracingShaderStage::Traversal; - if (IsTraversal && MetadataState.isInLgcCpsMode()) { + if (IsTraversal) { assert(F->arg_size() == 1); - // System data - // NOTE: Pointer address space may not match based on data layout, mutate - // the address space here to keep later GEP valid. 
- Data.SystemData->mutateType( - getWithSamePointeeType(Data.SystemData->getType(), F->getArg(0)->getType()->getPointerAddressSpace())); - NewSystemData = Data.SystemData; - } else { - PayloadHelper.initializePayloadSerializationStorage(NewFunc, Data); - - if (auto *ContPayloadRegistersGetI32 = Mod->getFunction("_AmdContPayloadRegistersGetI32")) - handleContPayloadRegistersGetI32(*ContPayloadRegistersGetI32, *NewFunc, Data); - - if (auto *ContPayloadRegistersSetI32 = Mod->getFunction("_AmdContPayloadRegistersSetI32")) - handleContPayloadRegistersSetI32(*ContPayloadRegistersSetI32, *NewFunc, Data); - - if (IsTraversal) { + if (MetadataState.isInLgcCpsMode()) { + // System data + // NOTE: Pointer address space may not match based on data layout, mutate + // the address space here to keep later GEP valid. + Data.SystemData->mutateType( + getWithSamePointeeType(Data.SystemData->getType(), F->getArg(0)->getType()->getPointerAddressSpace())); + NewSystemData = Data.SystemData; + } else { // Replace old system data argument with cloned functions' argument NewSystemData = NewFunc->getArg(1); } } + PayloadHelper.initializePayloadSerializationStorage(NewFunc, Data); + + if (auto *ContPayloadRegistersGetI32 = Mod->getFunction("_AmdContPayloadRegistersGetI32")) + handleContPayloadRegistersGetI32(*ContPayloadRegistersGetI32, *NewFunc, Data); + + if (auto *ContPayloadRegistersSetI32 = Mod->getFunction("_AmdContPayloadRegistersSetI32")) + handleContPayloadRegistersSetI32(*ContPayloadRegistersSetI32, *NewFunc, Data); + if (NewSystemData) F->getArg(0)->replaceAllUsesWith(NewSystemData); @@ -2134,8 +2032,9 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData // Modify function ends // While iterating over function ends, basic blocks are inserted by inlining // functions, so we copy them beforehand. 
- if (MetadataState.isInLgcCpsMode() && Data.Kind == RayTracingShaderStage::Traversal) { - PayloadHelper.patchJumpCalls(NewFunc, Data.JumpCalls, Data.FirstPayloadArgumentDword); + if (Data.Kind == RayTracingShaderStage::Traversal) { + PayloadHelper.patchJumpCalls(NewFunc, Data.JumpCalls, Data.FirstPayloadArgumentDword, + Data.NumPassedThroughPayloadDwords, Data.PayloadStorage); } else { SmallVector BBs(make_pointer_range(*NewFunc)); for (auto *BB : BBs) { @@ -2464,9 +2363,9 @@ void LowerRaytracingPipelinePassImpl::collectGpuRtFunctions() { LowerRaytracingPipelinePassImpl::LowerRaytracingPipelinePassImpl(llvm::Module &M, Module &GpurtLibrary) : Mod{&M}, GpurtLibrary{&GpurtLibrary}, Context{&M.getContext()}, DL{&M.getDataLayout()}, - Builder{Mod->getContext()}, MetadataState{*Mod}, PAQManager{Mod, &GpurtLibrary, - MetadataState.getMaxPayloadRegisterCount()}, - PayloadHelper{*Mod, *DL, Builder, MetadataState.isInLgcCpsMode()} { + Builder{Mod->getContext()}, MetadataState{*Mod}, + PAQManager{Mod, &GpurtLibrary, MetadataState.getMaxPayloadRegisterCount()}, PayloadHelper{*Mod, *DL, Builder} { + RcrTy = MetadataState.isInLgcCpsMode() ? 
Builder.getInt32Ty() : Builder.getInt64Ty(); } PreservedAnalyses LowerRaytracingPipelinePassImpl::run() { diff --git a/llvmraytracing/plugin/Plugin.cpp b/llvmraytracing/plugin/Plugin.cpp index 01f5d3c91b..343741e354 100644 --- a/llvmraytracing/plugin/Plugin.cpp +++ b/llvmraytracing/plugin/Plugin.cpp @@ -38,9 +38,3 @@ llvm::PassPluginLibraryInfo getRaytracingPluginPluginInfo() { return {LLVM_PLUGIN_API_VERSION, "Raytracing", LLVM_VERSION_STRING, [](llvm::PassBuilder &PB) { llvm::ContHelper::RegisterPasses(PB, true); }}; } - -#ifndef LLVM_RAYTRACINGPLUGIN_LINK_INTO_TOOLS -extern "C" LLVM_ATTRIBUTE_WEAK ::llvm::PassPluginLibraryInfo llvmGetPassPluginInfo() { - return getRaytracingPluginPluginInfo(); -} -#endif diff --git a/llvmraytracing/test/dx/cleanup-continuations-malloc.ll b/llvmraytracing/test/dx/cleanup-continuations-malloc.ll index 87993ba422..9530ebd768 100644 --- a/llvmraytracing/test/dx/cleanup-continuations-malloc.ll +++ b/llvmraytracing/test/dx/cleanup-continuations-malloc.ll @@ -7,6 +7,8 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: declare void @await.void(%continuation.token*) declare %continuation.token* @async_fun() +declare void @lgc.cps.jump(...) 
+declare void @lgc.cps.complete() define <4 x i32> @simple_await(i64 %dummyRet, <4 x i32> %arg) !continuation.registercount !1 { ; CHECK-LABEL: define void @simple_await( @@ -15,15 +17,16 @@ define <4 x i32> @simple_await(i64 %dummyRet, <4 x i32> %arg) !continuation.regi ; CHECK-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 24) ; CHECK-NEXT: [[ARG_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 ; CHECK-NEXT: store <4 x i32> [[ARG]], ptr addrspace(32) [[ARG_SPILL_ADDR]], align 4 -; CHECK-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1 -; CHECK-NEXT: store i64 [[DUMMYRET]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4 +; CHECK-NEXT: [[DUMMYRET_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1 +; CHECK-NEXT: store i64 [[DUMMYRET]], ptr addrspace(32) [[DUMMYRET_SPILL_ADDR]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await.resume.0) ; CHECK-NEXT: call void (...) @lgc.cps.jump(i64 ptrtoint (ptr @async_fun to i64), i32 -1, {} poison, i64 [[TMP0]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; CHECK-NEXT: unreachable ; %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1 call void @await.void(%continuation.token* %tok) - ret <4 x i32> %arg, !continuation.registercount !1 + call void (...) 
@lgc.cps.jump(i64 %dummyRet, i32 -1, {} poison, i64 poison, <4 x i32> %arg), !continuation.registercount !1 + unreachable } define void @simple_await_entry(i64 %dummyRet, <4 x i32> %arg, <4 x i32> addrspace(1)* %mem) !continuation.entry !0 !continuation.registercount !1 { @@ -42,7 +45,8 @@ define void @simple_await_entry(i64 %dummyRet, <4 x i32> %arg, <4 x i32> addrspa %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1 call void @await.void(%continuation.token* %tok) store <4 x i32> %arg, <4 x i32> addrspace(1)* %mem - ret void, !continuation.registercount !1 + call void @lgc.cps.complete(), !continuation.registercount !1 + unreachable } !continuation.stackAddrspace = !{!2} diff --git a/llvmraytracing/test/dx/cleanup-continuations.ll b/llvmraytracing/test/dx/cleanup-continuations.ll index 838de9210f..9ddee2abb0 100644 --- a/llvmraytracing/test/dx/cleanup-continuations.ll +++ b/llvmraytracing/test/dx/cleanup-continuations.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --version 3 -; RUN: opt --verify-each -passes='legacy-cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck %s +; RUN: opt --verify-each -passes='legacy-cleanup-continuations,lint,continuations-lint' -S %s --lint-abort-on-error | FileCheck %s target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" @@ -11,7 +11,8 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: declare %continuation.token* @async_fun() declare i32 @lgc.ilcps.getReturnValue__i32() #0 -declare void @lgc.ilcps.return(i64, ...) +declare void @lgc.cps.complete() +declare void @lgc.cps.jump(...) 
define { i8*, %continuation.token* } @simple_await(i64 %dummyRet, i8* %0) !continuation !0 !continuation.registercount !4 { ; CHECK-LABEL: define void @simple_await( @@ -44,7 +45,7 @@ define internal { i8*, %continuation.token* } @simple_await.resume.0(i8* noalias ; CHECK-NEXT: [[DOTRELOAD_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr addrspace(32) [[FRAMEPTR]], i32 0, i32 0 ; CHECK-NEXT: [[DOTRELOAD:%.*]] = load i64, ptr addrspace(32) [[DOTRELOAD_ADDR]], align 4 ; CHECK-NEXT: call void @lgc.cps.free(i32 8) -; CHECK-NEXT: call void (...) @lgc.ilcps.continue(i64 [[DOTRELOAD]], i32 poison, i64 poison, i64 undef), !continuation.registercount [[META2]] +; CHECK-NEXT: call void (...) @lgc.cps.jump(i64 [[DOTRELOAD]], i32 -1, {} poison, i64 poison), !continuation.registercount [[META2]] ; CHECK-NEXT: unreachable ; entryresume.0: @@ -52,7 +53,7 @@ entryresume.0: %vFrame = bitcast %simple_await.Frame* %FramePtr to i8* %.reload.addr = getelementptr inbounds %simple_await.Frame, %simple_await.Frame* %FramePtr, i32 0, i32 0 %.reload = load i64, i64* %.reload.addr, align 4 - call void (i64, ...) @lgc.ilcps.return(i64 %.reload, i64 undef), !continuation.registercount !4 + call void (...) @lgc.cps.jump(i64 %.reload, i32 -1, {} poison, i64 poison), !continuation.registercount !4 unreachable } @@ -90,7 +91,7 @@ define internal { i8*, %continuation.token* } @simple_await_entry.resume.0(i8* n entryresume.0: %FramePtr = bitcast i8* %0 to %simple_await_entry.Frame* %vFrame = bitcast %simple_await_entry.Frame* %FramePtr to i8* - call void (i64, ...) 
@lgc.ilcps.return(i64 undef), !continuation.registercount !4 + call void @lgc.cps.complete(), !continuation.registercount !4 unreachable } @@ -122,7 +123,7 @@ define internal { i8*, %continuation.token* } @await_with_ret_value.resume.0(i8* ; CHECK-NEXT: [[DOTRELOAD_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_RET_VALUE_FRAME:%.*]], ptr addrspace(32) [[FRAMEPTR]], i32 0, i32 0 ; CHECK-NEXT: [[DOTRELOAD:%.*]] = load i64, ptr addrspace(32) [[DOTRELOAD_ADDR]], align 4 ; CHECK-NEXT: call void @lgc.cps.free(i32 8) -; CHECK-NEXT: call void (...) @lgc.ilcps.continue(i64 [[DOTRELOAD]], i32 poison, i64 poison, i32 [[RES1]], i64 undef), !continuation.registercount [[META2]] +; CHECK-NEXT: call void (...) @lgc.cps.jump(i64 [[DOTRELOAD]], i32 -1, {} poison, i64 poison, i32 [[RES1]]), !continuation.registercount [[META2]] ; CHECK-NEXT: unreachable ; %FramePtr = bitcast i8* %0 to %await_with_ret_value.Frame* @@ -130,7 +131,7 @@ define internal { i8*, %continuation.token* } @await_with_ret_value.resume.0(i8* %.reload.addr = getelementptr inbounds %await_with_ret_value.Frame, %await_with_ret_value.Frame* %FramePtr, i32 0, i32 0 %.reload = load i64, i64* %.reload.addr, align 4 %res = call i32 @lgc.ilcps.getReturnValue__i32() - call void (i64, ...) @lgc.ilcps.return(i64 %.reload, i32 %res, i64 undef), !continuation.registercount !4 + call void (...) @lgc.cps.jump(i64 %.reload, i32 -1, {} poison, i64 poison, i32 %res), !continuation.registercount !4 unreachable } @@ -153,7 +154,7 @@ define { i8*, %continuation.token* } @switch_case_unreachable(i64 %dummyRet, i8* ; CHECK-NEXT: br label [[A]] ; CHECK: a: ; CHECK-NEXT: call void @lgc.cps.free(i32 8) -; CHECK-NEXT: call void (...) @lgc.ilcps.continue(i64 [[DUMMYRET]], i32 poison, i64 poison, i32 5, i64 undef), !continuation.registercount [[META2]] +; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i64 [[DUMMYRET]], i32 -1, {} poison, i64 poison), !continuation.registercount [[META2]] ; CHECK-NEXT: unreachable ; %FramePtr = bitcast i8* %0 to %await_with_ret_value.Frame* @@ -172,7 +173,7 @@ b: br label %a a: - call void (i64, ...) @lgc.ilcps.return(i64 %dummyRet, i32 5, i64 undef), !continuation.registercount !4 + call void (...) @lgc.cps.jump(i64 %dummyRet, i32 -1, {} poison, i64 poison), !continuation.registercount !4 unreachable } @@ -193,7 +194,7 @@ define { i8*, %continuation.token* } @phi_of_cont_state(i64 %dummyRet, ptr %Fram ; CHECK-NEXT: [[C:%.*]] = phi ptr addrspace(32) [ [[A]], [[LA]] ], [ [[B]], [[LB]] ] ; CHECK-NEXT: store i64 -1, ptr addrspace(32) [[C]], align 4 ; CHECK-NEXT: call void @lgc.cps.free(i32 8) -; CHECK-NEXT: call void (...) @lgc.ilcps.continue(i64 [[DUMMYRET]], i32 poison, i64 poison, i32 5, i64 undef), !continuation.registercount [[META2]] +; CHECK-NEXT: call void (...) @lgc.cps.jump(i64 [[DUMMYRET]], i32 -1, {} poison, i64 poison), !continuation.registercount [[META2]] ; CHECK-NEXT: unreachable ; %cond = trunc i64 %dummyRet to i1 @@ -210,7 +211,7 @@ lb: end: %c = phi ptr [ %a, %la ], [ %b, %lb ] store i64 -1, ptr %c, align 4 - call void (i64, ...) @lgc.ilcps.return(i64 %dummyRet, i32 5, i64 undef), !continuation.registercount !4 + call void (...) @lgc.cps.jump(i64 %dummyRet, i32 -1, {} poison, i64 poison), !continuation.registercount !4 unreachable } @@ -230,9 +231,7 @@ attributes #0 = { nounwind } ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind willreturn memory(inaccessiblemem: readwrite) } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind willreturn } -; CHECK: attributes #[[ATTR3:[0-9]+]] = { noreturn } -; CHECK: attributes #[[ATTR4:[0-9]+]] = { nounwind willreturn memory(inaccessiblemem: read) } -; CHECK: attributes #[[ATTR5:[0-9]+]] = { noreturn nounwind } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind willreturn memory(inaccessiblemem: read) } ;. 
; CHECK: [[META0:![0-9]+]] = !{i32 21} ; CHECK: [[META1]] = !{ptr @simple_await} diff --git a/llvmraytracing/test/dx/closest-hit-procedural.ll b/llvmraytracing/test/dx/closest-hit-procedural.ll index d719f40f40..1ba48b4475 100644 --- a/llvmraytracing/test/dx/closest-hit-procedural.ll +++ b/llvmraytracing/test/dx/closest-hit-procedural.ll @@ -140,7 +140,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP18]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP16]], ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_3_CLOSESTHIT_IN_PAYLOAD_ATTR_2_I32S:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP2]], align 4 @@ -175,7 +175,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP38]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP39]], [19 x i32] poison, [10 x i32] [[TMP40]]), !continuation.registercount [[META16]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP39]], [19 x i32] poison, [10 x i32] [[TMP40]]), !continuation.registercount [[META16]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @ClosestHit( @@ -198,7 +198,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: store <3 x i32> [[DOTFCA_0_0_EXTRACT]], ptr [[DOTFCA_0_0_GEP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP2]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_09_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP3:%.*]] = bitcast float [[DOTSROA_09_0_VEC_EXTRACT]] to i32 diff --git a/llvmraytracing/test/dx/closest-hit-traceray.ll b/llvmraytracing/test/dx/closest-hit-traceray.ll index b31ab7befd..ab9465a7a4 100644 --- a/llvmraytracing/test/dx/closest-hit-traceray.ll +++ b/llvmraytracing/test/dx/closest-hit-traceray.ll @@ -137,7 +137,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP19]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP16]], ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[HITATTRS]], align 4 @@ -153,14 +153,14 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP32]]) #[[ATTR10:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP31]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> zeroinitializer, ptr [[TMP33]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP29]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP26]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP29]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP26]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP35]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP37]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[NEWDATA_I:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] 
[[AMD_DX_TRAVERSAL:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[NEWDATA_I:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] @[[AMD_DX_TRAVERSAL:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[NEWDATA_I]], ptr [[TMP37]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = load i32, ptr [[TMP45]], align 4 @@ -180,7 +180,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP43]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP44]], [19 x i32] poison, [10 x i32] [[TMP50]]), !continuation.registercount [[META16]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP44]], [19 x i32] poison, [10 x i32] [[TMP50]]), !continuation.registercount [[META16]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @ClosestHit( @@ -203,7 +203,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: store <3 x i32> [[DOTFCA_0_0_EXTRACT]], ptr [[DOTFCA_0_0_GEP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP2]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_08_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP3:%.*]] = bitcast float [[DOTSROA_08_0_VEC_EXTRACT]] to i32 @@ -212,8 +212,8 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; DXILCONTPOSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; DXILCONTPOSTPROCESS-NEXT: [[TMP5:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP6:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP5]]) -; DXILCONTPOSTPROCESS-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; DXILCONTPOSTPROCESS-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP5]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; DXILCONTPOSTPROCESS-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) ; DXILCONTPOSTPROCESS-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[DIS_DATA_I_FCA_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP10]], i32 0, i32 0 @@ -221,7 +221,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; DXILCONTPOSTPROCESS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DIS_DATA_I_FCA_0_LOAD]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 -; DXILCONTPOSTPROCESS-NEXT: [[NEWDATA_I:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AMD_DX_TRAVERSAL:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]]) +; DXILCONTPOSTPROCESS-NEXT: [[NEWDATA_I:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] @[[AMD_DX_TRAVERSAL:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]]) ; DXILCONTPOSTPROCESS-NEXT: [[NEWDATA_I_FCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[NEWDATA_I]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[NEWDATA_I_FCA_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP10]], i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: store <3 x i32> [[NEWDATA_I_FCA_0_EXTRACT]], ptr [[NEWDATA_I_FCA_0_GEP]], align 4 diff --git a/llvmraytracing/test/dx/closest-hit.ll b/llvmraytracing/test/dx/closest-hit.ll index fe1e19f8f4..593fa611a7 100644 --- a/llvmraytracing/test/dx/closest-hit.ll +++ b/llvmraytracing/test/dx/closest-hit.ll @@ -117,7 +117,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP12]], ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP10]], ptr [[HITATTRS]], align 4 @@ -140,7 +140,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP26]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP27]], [19 x i32] poison, [8 x i32] [[TMP24]]), !continuation.registercount [[META10]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP27]], [19 x i32] poison, [8 x i32] [[TMP24]]), !continuation.registercount [[META10]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; %ptr = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0 diff --git a/llvmraytracing/test/dx/continuation-registercount.ll b/llvmraytracing/test/dx/continuation-registercount.ll index a972f1e5a1..5d0bdefd4e 100644 --- a/llvmraytracing/test/dx/continuation-registercount.ll +++ b/llvmraytracing/test/dx/continuation-registercount.ll @@ -1,9 +1,9 @@ ; RUN: grep -v MAX_REG_10 %s | \ -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,continuations-lint,remove-types-metadata' -S --lint-abort-on-error | \ +; RUN: opt --verify-each --report-payload-register-sizes=byjump -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,continuations-stats-report,remove-types-metadata' -S --lint-abort-on-error 2>&1 | \ ; RUN: FileCheck -check-prefixes=COMMON,MAX30 %s ; ; RUN: grep -v MAX_REG_30 %s | \ -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,continuations-lint,remove-types-metadata' -S --lint-abort-on-error | \ +; RUN: opt --verify-each --report-payload-register-sizes=byjump 
-passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,continuations-stats-report,remove-types-metadata' -S --lint-abort-on-error 2>&1 | \ ; RUN: FileCheck -check-prefixes=COMMON,MAX10 %s ; The order of metadata on functions is non-deterministic, so make two different runs to match both of them. @@ -117,8 +117,9 @@ define i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hi ret i1 true } -; COMMON-DAG: define void @main( -; COMMON-DAG: call void (...) @lgc.cps.jump(i64 2, {{.*}} %struct.DispatchSystemData %{{.*}}, [10 x i32] %{{.*}}) +; COMMON-DAG: Incoming payload VGPR size of "main" (raygeneration): 0 dwords +; COMMON-DAG: Outgoing payload VGPR size by jump: +; COMMON-DAG: call void (...) @lgc.cps.jump(i64 2, {{.*}} %struct.DispatchSystemData %{{.*}}: 10 dwords define void @main() { %params = alloca %struct.TheirParams, align 4 @@ -126,9 +127,10 @@ define void @main() { ret void } -; COMMON-DAG: define void @mainTrace( -; MAX10-DAG: call void (...) @lgc.cps.jump(i64 4, {{.*}} %struct.TraversalData %{{.*}}, [10 x i32] %{{.*}}) -; MAX30-DAG: call void (...) @lgc.cps.jump(i64 4, {{.*}} %struct.TraversalData %{{.*}}, [15 x i32] %{{.*}}) +; COMMON-DAG: Incoming payload VGPR size of "mainTrace" (raygeneration): 0 dwords +; COMMON-DAG: Outgoing payload VGPR size by jump: +; MAX10-DAG: call void (...) @lgc.cps.jump(i64 4, {{.*}} %struct.TraversalData %{{.*}}: 10 dwords +; MAX30-DAG: call void (...) 
@lgc.cps.jump(i64 4, {{.*}} %struct.TraversalData %{{.*}}: 15 dwords define void @mainTrace() { %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 @@ -141,10 +143,10 @@ define void @mainTrace() { } ; If we set maxPayloadRegisterCount to 10, both functions use only 10 payload registers. -; MAX10-DAG: define void @called({{.*}}%struct.DispatchSystemData %0{{.*}}, [10 x i32] %payload) -; MAX10-DAG: define dso_local void @called.resume.0({{.*}}%struct.DispatchSystemData{{.*}}, [10 x i32] }{{.*}}) -; MAX30-DAG: define void @called({{.*}}%struct.DispatchSystemData %0{{.*}}, [26 x i32] %payload) -; MAX30-DAG: define dso_local void @called.resume.0({{.*}}%struct.DispatchSystemData{{.*}}, [27 x i32] }{{.*}}) +; MAX10-DAG: Incoming payload VGPR size of "called" (callable): 10 dwords +; MAX10-DAG: Incoming payload VGPR size of "called.resume.0" (callable): 10 dwords +; MAX30-DAG: Incoming payload VGPR size of "called" (callable): 26 dwords +; MAX30-DAG: Incoming payload VGPR size of "called.resume.0" (callable): 27 dwords define void @called(%struct.MyParams* %arg) !pointeetys !39 { %params = alloca %struct.TheirParams2, align 4 @@ -152,12 +154,13 @@ define void @called(%struct.MyParams* %arg) !pointeetys !39 { ret void } -; MAX10-DAG: define void @Intersection({{.*}}%struct.AnyHitTraversalData %0{{.*}}, [10 x i32] %payload) -; MAX10-DAG: define dso_local void @Intersection.resume.0({{.*}}%struct.AnyHitTraversalData{{.*}}, [10 x i32] }{{.*}}) -; MAX10-DAG: call void (...) 
@lgc.cps.jump(i64 3, {{.*}} float 4.000000e+00, i32 0, %struct.BuiltInTriangleIntersectionAttributes {{.*}}, [10 x i32] %{{.*}}) -; MAX30-DAG: define void @Intersection({{.*}}%struct.AnyHitTraversalData %0{{.*}}, [30 x i32] %payload) -; MAX30-DAG: define dso_local void @Intersection.resume.0({{.*}}%struct.AnyHitTraversalData{{.*}}, [30 x i32] }{{.*}}) -; MAX30-DAG: call void (...) @lgc.cps.jump(i64 3, {{.*}} float 4.000000e+00, i32 0, %struct.BuiltInTriangleIntersectionAttributes {{.*}}, [30 x i32] %{{.*}}) +; MAX10-DAG: Incoming payload VGPR size of "Intersection" (intersection): 10 dwords +; MAX10-DAG: Incoming payload VGPR size of "Intersection.resume.0" (intersection): 10 dwords +; COMMON-DAG: Outgoing payload VGPR size by jump: +; MAX10-DAG: call void (...) @lgc.cps.jump(i64 3, {{.*}} float 4.000000e+00, i32 0, %struct.BuiltInTriangleIntersectionAttributes {{.*}}: 10 dwords +; MAX30-DAG: Incoming payload VGPR size of "Intersection.resume.0" (intersection): 30 dwords +; COMMON-DAG: Outgoing payload VGPR size by jump: +; MAX30-DAG: call void (...) 
@lgc.cps.jump(i64 3, {{.*}} float 4.000000e+00, i32 0, %struct.BuiltInTriangleIntersectionAttributes {{.*}}: 30 dwords define void @Intersection() #3 { %a = alloca %struct.BuiltInTriangleIntersectionAttributes, align 4 @@ -165,32 +168,34 @@ define void @Intersection() #3 { ret void } -; MAX10-DAG: define void @AnyHit({{.*}}%struct.AnyHitTraversalData %0, %struct.BuiltInTriangleIntersectionAttributes %1{{.*}}, [10 x i32] %payload) -; MAX30-DAG: define void @AnyHit({{.*}}%struct.AnyHitTraversalData %0, %struct.BuiltInTriangleIntersectionAttributes %1{{.*}}, [15 x i32] %payload) +; MAX10-DAG: Incoming payload VGPR size of "AnyHit" (anyhit): 10 dwords +; MAX30-DAG: Incoming payload VGPR size of "AnyHit" (anyhit): 15 dwords define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #3 !pointeetys !41 { ret void } ; With fixed hit attribute registers and without PAQs, ClosestHitOut also contains storage for hit attributes -; MAX10-DAG: define void @ClosestHit({{.*}}%struct.SystemData %0{{.*}}, [10 x i32] %payload) -; MAX30-DAG: define void @ClosestHit({{.*}}%struct.SystemData %0{{.*}}, [15 x i32] %payload) +; MAX10-DAG: Incoming payload VGPR size of "ClosestHit" (closesthit): 10 dwords +; MAX30-DAG: Incoming payload VGPR size of "ClosestHit" (closesthit): 15 dwords define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct.AnyHitTraversalData* nocapture readonly %attr) #3 !pointeetys !41 { ret void } -; COMMON-DAG: define void @Miss16({{.*}}%struct.SystemData %0{{.*}}, [1 x i32] %payload) +; COMMON-DAG: Incoming payload VGPR size of "Miss16" (miss): 1 dwords define void @Miss16(%struct.PayloadWithI16* noalias nocapture %payload) !pointeetys !55 { ret void } declare void @_AmdEnqueueAnyHit(i64, i64, %struct._AmdSystemData, <2 x float>) #0 -; MAX10-DAG: define void @_cont_Traversal({{.*}}, [10 x i32] %payload) -; MAX10-DAG: call {{.*}} @lgc.cps.jump({{.*}}, [10 x i32] 
%{{.*}}) -; MAX30-DAG: define void @_cont_Traversal({{.*}}, [27 x i32] %payload) -; MAX30-DAG: call {{.*}} @lgc.cps.jump({{.*}}, [27 x i32] %{{.*}}) +; MAX10-DAG: Incoming payload VGPR size of "_cont_Traversal" (compute): 10 dwords +; COMMON-DAG: Outgoing payload VGPR size by jump: +; MAX10-DAG: call {{.*}} @lgc.cps.jump({{.*}}: 10 dwords +; MAX30-DAG: Incoming payload VGPR size of "_cont_Traversal" (compute): 27 dwords +; COMMON-DAG: Outgoing payload VGPR size by jump: +; MAX30-DAG: call {{.*}} @lgc.cps.jump({{.*}}: 27 dwords define void @_cont_Traversal(%struct._AmdTraversalResultData* noalias nocapture sret(%struct._AmdTraversalResultData) %agg.result, %struct._AmdSystemData* noalias %data) !pointeetys !44 { call void @_AmdEnqueueAnyHit(i64 0, i64 poison, %struct.BuiltInTriangleIntersectionAttributes undef, <2 x float> undef) diff --git a/llvmraytracing/test/dx/continuation-state.ll b/llvmraytracing/test/dx/continuation-state.ll index 34a1082bea..fdff6bdc3f 100644 --- a/llvmraytracing/test/dx/continuation-state.ll +++ b/llvmraytracing/test/dx/continuation-state.ll @@ -8,18 +8,22 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: declare void @await.void(%continuation.token*) declare i32 @_cont_GetContinuationStackAddr() declare %continuation.token* @async_fun() +declare void @lgc.cps.jump(...) +declare void @lgc.cps.complete() define <4 x i32> @simple_await(i64 %returnAddr, <4 x i32> %arg) !continuation.registercount !1 { %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1 call void @await.void(%continuation.token* %tok) - ret <4 x i32> %arg, !continuation.registercount !1 + call void (...) 
@lgc.cps.jump(i64 %returnAddr, i32 -1, i64 poison, i64 poison, <4 x i32> %arg), !continuation.registercount !1 + unreachable } define void @simple_await_entry(i64 %returnAddr, <4 x i32> %arg, <4 x i32> addrspace(1)* %mem) !continuation.entry !0 !continuation.registercount !1 { %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1 call void @await.void(%continuation.token* %tok) store <4 x i32> %arg, <4 x i32> addrspace(1)* %mem - ret void, !continuation.registercount !1 + call void @lgc.cps.complete(), !continuation.registercount !1 + unreachable } !continuation.maxPayloadRegisterCount = !{!2} @@ -51,7 +55,7 @@ define void @simple_await_entry(i64 %returnAddr, <4 x i32> %arg, <4 x i32> addrs ; CLEANUP-NEXT: [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1 ; CLEANUP-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(32) [[RETURNADDR_RELOAD_ADDR]], align 4 ; CLEANUP-NEXT: call void @lgc.cps.free(i32 24) -; CLEANUP-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 poison, i64 poison, <4 x i32> [[ARG_RELOAD]]), !continuation.registercount [[META2]] +; CLEANUP-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR_RELOAD]], i32 -1, i64 poison, i64 poison, <4 x i32> [[ARG_RELOAD]]), !continuation.registercount [[META2]] ; CLEANUP-NEXT: unreachable ; ; diff --git a/llvmraytracing/test/dx/continuation-without-await.ll b/llvmraytracing/test/dx/continuation-without-await.ll index bfbbbe1a9b..d0f0d6155b 100644 --- a/llvmraytracing/test/dx/continuation-without-await.ll +++ b/llvmraytracing/test/dx/continuation-without-await.ll @@ -153,7 +153,10 @@ attributes #2 = { nounwind } ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP6]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP18]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; LOWERRAYTRACINGPIPELINE-NEXT: ret void +; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] +; LOWERRAYTRACINGPIPELINE: .split: +; LOWERRAYTRACINGPIPELINE-NEXT: call void @lgc.cps.complete() +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define void @main_no_call( @@ -162,7 +165,8 @@ attributes #2 = { nounwind } ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [0 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; LOWERRAYTRACINGPIPELINE-NEXT: ret void +; LOWERRAYTRACINGPIPELINE-NEXT: call void @lgc.cps.complete() +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @called( @@ -197,7 +201,7 @@ attributes #2 = { nounwind } ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP26]], ptr [[TMP24]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load [3 x i32], 
ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP27]], [8 x i32] poison, [3 x i32] [[TMP28]]), !continuation.registercount [[META17]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP27]], [8 x i32] poison, [3 x i32] [[TMP28]]), !continuation.registercount [[META17]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; @@ -253,7 +257,7 @@ attributes #2 = { nounwind } ; CLEANUP-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; CLEANUP-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 ; CLEANUP-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 -; CLEANUP-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT5]], [8 x i32] poison, [3 x i32] [[DOTFCA_2_INSERT]]), !continuation.registercount [[META17]] +; CLEANUP-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT5]], [8 x i32] poison, [3 x i32] [[DOTFCA_2_INSERT]]), !continuation.registercount [[META17]] ; CLEANUP-NEXT: unreachable ; ; diff --git a/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace.ll b/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace.ll index 9b0c645bf1..63527fa776 100644 --- a/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace.ll +++ b/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace.ll @@ -46,7 +46,7 @@ define void @Intersection() #0 { ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.rt.instance.id() ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @lgc.rt.hit.kind() ; CHECK-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = call i1 (...) @lgc.rt.report.hit(float 4.000000e+00, i32 0, ptr [[TMP5]], i32 8), !cont.payload.type [[META25:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 (...) @lgc.rt.report.hit(float 4.000000e+00, i32 0, ptr [[TMP5]], i32 8), !cont.payload.type [[META20:![0-9]+]] ; CHECK-NEXT: ret void ; %1 = call float @dx.op.rayTMin.f32(i32 153) ; RayTMin() @@ -64,7 +64,7 @@ define void @main() { ; CHECK-NEXT: [[PARAMS:%.*]] = alloca [[STRUCT_THEIRPARAMS:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @lgc.rt.shader.index() ; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP1]]) -; CHECK-NEXT: call void (...) @lgc.rt.call.callable.shader(i32 1, ptr [[PARAMS]], i32 256), !cont.payload.type [[META26:![0-9]+]] +; CHECK-NEXT: call void (...) 
@lgc.rt.call.callable.shader(i32 1, ptr [[PARAMS]], i32 256), !cont.payload.type [[META18:![0-9]+]] ; CHECK-NEXT: ret void ; %params = alloca %struct.TheirParams, align 4 @@ -81,10 +81,10 @@ define void @mainTrace() { ; CHECK-NEXT: [[TMP3:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 ; CHECK-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) -; CHECK-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP6]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; CHECK-NEXT: [[TMP6:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) +; CHECK-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP6]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP7]]) -; CHECK-NEXT: call void (...) @lgc.rt.trace.ray(i64 [[TMP8]], i32 16, i32 -1, i32 0, i32 1, i32 0, <3 x float> zeroinitializer, float 0x3F50624DE0000000, <3 x float> , float 1.000000e+04, ptr [[TMP4]], [1 x i32] [i32 272]), !cont.payload.type [[META27:![0-9]+]] +; CHECK-NEXT: call void (...) 
@lgc.rt.trace.ray(i64 [[TMP8]], i32 16, i32 -1, i32 0, i32 1, i32 0, <3 x float> zeroinitializer, float 0x3F50624DE0000000, <3 x float> , float 1.000000e+04, ptr [[TMP4]], [1 x i32] [i32 272]), !cont.payload.type [[META17:![0-9]+]] ; CHECK-NEXT: ret void ; %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 @@ -99,11 +99,11 @@ define void @mainTrace() { define void @called(%struct.MyParams* %arg) !pointeetys !38 { ; CHECK-LABEL: define void @called( -; CHECK-SAME: ptr [[ARG:%.*]]) !pointeetys [[META28:![0-9]+]] !lgc.rt.shaderstage [[META30:![0-9]+]] !cont.payload.type [[META31:![0-9]+]] { +; CHECK-SAME: ptr [[ARG:%.*]]) !pointeetys [[META21:![0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !cont.payload.type [[META21]] { ; CHECK-NEXT: [[PARAMS:%.*]] = alloca [[STRUCT_THEIRPARAMS2:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @lgc.rt.shader.index() ; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP1]]) -; CHECK-NEXT: call void (...) @lgc.rt.call.callable.shader(i32 2, ptr [[PARAMS]], i32 260), !cont.payload.type [[META32:![0-9]+]] +; CHECK-NEXT: call void (...) 
@lgc.rt.call.callable.shader(i32 2, ptr [[PARAMS]], i32 260), !cont.payload.type [[META19:![0-9]+]] ; CHECK-NEXT: ret void ; %params = alloca %struct.TheirParams2, align 4 diff --git a/llvmraytracing/test/dx/dxil-cont-intrinsic-prepare.ll b/llvmraytracing/test/dx/dxil-cont-intrinsic-prepare.ll index 7082c1bd07..aac3488367 100644 --- a/llvmraytracing/test/dx/dxil-cont-intrinsic-prepare.ll +++ b/llvmraytracing/test/dx/dxil-cont-intrinsic-prepare.ll @@ -39,9 +39,6 @@ declare !pointeetys !3 void @"\01?_AmdAwait@@YA?AUDispatchSystemData@@UTraversal ; Function Attrs: nounwind declare i64 @_AmdGetResumePointAddr() #3 -; Function Attrs: nounwind -declare void @_AmdComplete() #3 - ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) declare !pointeetys !5 void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #4 @@ -81,7 +78,7 @@ attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], i32 0, i32 2 ; CHECK-NEXT: store i64 [[ADDR]], ptr [[A]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[_AMDAWAIT:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i64 3, [[STRUCT_TRAVERSALDATA]] [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] @[[_AMDAWAIT:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i64 3, [[STRUCT_TRAVERSALDATA]] [[TMP8]]) ; CHECK-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]], ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP2]], i32 0, i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 diff --git a/llvmraytracing/test/dx/dxil-cont-post-process.ll b/llvmraytracing/test/dx/dxil-cont-post-process.ll index dc309866fd..56d7ee3944 100644 --- 
a/llvmraytracing/test/dx/dxil-cont-post-process.ll +++ b/llvmraytracing/test/dx/dxil-cont-post-process.ll @@ -7,7 +7,6 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: @debug_global = external global i1 -declare void @_AmdComplete() #0 declare i32 @_cont_GetContinuationStackAddr() declare i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) diff --git a/llvmraytracing/test/dx/dxil-cont-prepare-traversal.ll b/llvmraytracing/test/dx/dxil-cont-prepare-traversal.ll index 7e2aea5fcc..651080ad60 100644 --- a/llvmraytracing/test/dx/dxil-cont-prepare-traversal.ll +++ b/llvmraytracing/test/dx/dxil-cont-prepare-traversal.ll @@ -98,11 +98,11 @@ attributes #2 = { nounwind } ; PREPARE-NEXT: [[ADDR:%.*]] = zext i32 [[A4]] to i64 ; PREPARE-NEXT: [[TMP7:%.*]] = load [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP5]], align 4 ; PREPARE-NEXT: [[TMP10:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @_cont_Traversal) -; PREPARE-NEXT: call void (...) @lgc.ilcps.waitContinue(i64 [[ADDR]], i64 -1, i32 poison, i64 [[TMP10]], [[STRUCT_SYSTEMDATA]] [[TMP7]]) +; PREPARE-NEXT: call void (...) @lgc.cps.jump(i64 [[ADDR]], i32 -1, {} poison, i64 [[TMP10]], [[STRUCT_SYSTEMDATA]] [[TMP7]]), !waitmask [[META1:![0-9]+]] ; PREPARE-NEXT: unreachable ; PREPARE: 9: ; PREPARE-NEXT: [[TMP9:%.*]] = load [[STRUCT_SYSTEMDATA]], ptr [[TMP5]], align 4 -; PREPARE-NEXT: call void (...) @lgc.ilcps.waitContinue(i64 0, i64 -1, i32 poison, i64 2, [[STRUCT_SYSTEMDATA]] [[TMP9]]) +; PREPARE-NEXT: call void (...) 
@lgc.cps.jump(i64 0, i32 -1, {} poison, i64 2, [[STRUCT_SYSTEMDATA]] [[TMP9]]), !waitmask [[META1]] ; PREPARE-NEXT: unreachable ; ; @@ -177,9 +177,6 @@ attributes #2 = { nounwind } ; ALL-NEXT: [[DOTFCA_1_LOAD:%.*]] = load float, ptr [[DOTFCA_1_GEP]], align 4 ; ALL-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] [[DOTFCA_0_0_INSERT]], float [[DOTFCA_1_LOAD]], 1 ; ALL-NEXT: [[TMP12:%.*]] = call i64 @continuation.getAddrAndMD(ptr @_cont_Traversal) -; ALL-NEXT: [[DOTFCA_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, i32 [[DOTFCA_0_0_0_EXTRACT]], 0, 0, 0 -; ALL-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_0_0_0_INSERT]], float [[DOTFCA_0_1_EXTRACT]], 0, 1 -; ALL-NEXT: [[DOTFCA_1_INSERT125:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_0_1_INSERT]], i32 [[DOTFCA_1_EXTRACT]], 1 ; ALL-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [30 x i32] poison, i32 1, 0 ; ALL-NEXT: [[DOTFCA_1_INSERT1:%.*]] = insertvalue [30 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 ; ALL-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_1_INSERT1]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 @@ -220,9 +217,6 @@ attributes #2 = { nounwind } ; ALL-NEXT: [[DOTFCA_1_GEP4:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[TMP5]], i32 0, i32 1 ; ALL-NEXT: [[DOTFCA_1_LOAD5:%.*]] = load float, ptr [[DOTFCA_1_GEP4]], align 4 ; ALL-NEXT: [[DOTFCA_1_INSERT6:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] [[DOTFCA_0_0_INSERT3]], float [[DOTFCA_1_LOAD5]], 1 -; ALL-NEXT: [[DOTFCA_0_0_0_INSERT128:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, i32 [[DOTFCA_0_0_0_EXTRACT]], 0, 0, 0 -; ALL-NEXT: [[DOTFCA_0_1_INSERT131:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_0_0_0_INSERT128]], float [[DOTFCA_0_1_EXTRACT]], 0, 1 -; ALL-NEXT: [[DOTFCA_1_INSERT134:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_0_1_INSERT131]], i32 [[DOTFCA_1_EXTRACT]], 1 ; ALL-NEXT: [[DOTFCA_0_INSERT3:%.*]] = insertvalue [30 x 
i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; ALL-NEXT: [[DOTFCA_1_INSERT7:%.*]] = insertvalue [30 x i32] [[DOTFCA_0_INSERT3]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 ; ALL-NEXT: [[DOTFCA_2_INSERT9:%.*]] = insertvalue [30 x i32] [[DOTFCA_1_INSERT7]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 diff --git a/llvmraytracing/test/dx/dxil-cps-stack-lowering-global.ll b/llvmraytracing/test/dx/dxil-cps-stack-lowering-global.ll index 90442d10df..8c5d53e7fa 100644 --- a/llvmraytracing/test/dx/dxil-cps-stack-lowering-global.ll +++ b/llvmraytracing/test/dx/dxil-cps-stack-lowering-global.ll @@ -173,7 +173,7 @@ attributes #6 = { nounwind willreturn memory(inaccessiblemem: read) } ; CPS-STACK-LOWERING-CPS-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP3]] ; CPS-STACK-LOWERING-CPS-NEXT: store i32 [[RETURN_ADDR]], ptr addrspace(22) [[TMP5]], align 4 ; CPS-STACK-LOWERING-CPS-NEXT: store [1 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CPS-STACK-LOWERING-CPS-NEXT: [[TMP6:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[CONTINUATIONS_GETSYSTEMDATA_S_STRUCT_DISPATCHSYSTEMDATAS:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() +; CPS-STACK-LOWERING-CPS-NEXT: [[TMP6:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] @[[CONTINUATIONS_GETSYSTEMDATA_S_STRUCT_DISPATCHSYSTEMDATAS:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() ; CPS-STACK-LOWERING-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP6]], 0 ; CPS-STACK-LOWERING-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CPS-STACK-LOWERING-CPS-NEXT: [[TMP7:%.*]] = add i32 [[TMP3]], 9 diff --git a/llvmraytracing/test/dx/dxil-cps-stack-lowering-scratch.ll b/llvmraytracing/test/dx/dxil-cps-stack-lowering-scratch.ll index ecafd30575..2c3c98acf8 100644 --- a/llvmraytracing/test/dx/dxil-cps-stack-lowering-scratch.ll +++ b/llvmraytracing/test/dx/dxil-cps-stack-lowering-scratch.ll @@ -170,7 +170,7 @@ attributes #6 = { nounwind willreturn 
memory(inaccessiblemem: read) } ; CPS-STACK-LOWERING-CPS-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP3]], i32 0 ; CPS-STACK-LOWERING-CPS-NEXT: store i32 [[RETURN_ADDR]], ptr addrspace(21) [[TMP4]], align 4 ; CPS-STACK-LOWERING-CPS-NEXT: store [1 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CPS-STACK-LOWERING-CPS-NEXT: [[TMP5:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[CONTINUATIONS_GETSYSTEMDATA_S_STRUCT_DISPATCHSYSTEMDATAS:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() +; CPS-STACK-LOWERING-CPS-NEXT: [[TMP5:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] @[[CONTINUATIONS_GETSYSTEMDATA_S_STRUCT_DISPATCHSYSTEMDATAS:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() ; CPS-STACK-LOWERING-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP5]], 0 ; CPS-STACK-LOWERING-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CPS-STACK-LOWERING-CPS-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], 9 diff --git a/llvmraytracing/test/dx/inline-const-jump-target.ll b/llvmraytracing/test/dx/inline-const-jump-target.ll index 90184e8701..3cb73b3dac 100644 --- a/llvmraytracing/test/dx/inline-const-jump-target.ll +++ b/llvmraytracing/test/dx/inline-const-jump-target.ll @@ -91,7 +91,8 @@ define void @main() { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[TMP2]], i32 2, {} poison, i32 [[RET_ADDR_I]], i32 999, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], {} poison, [0 x i32] poison, [0 x i32] poison) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: _cont_CallShader.exit: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: ret void +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @lgc.cps.complete() +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; JUMP-INLINER-CPS-LABEL: define void @main( ; JUMP-INLINER-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !lgc.cps [[META15:![0-9]+]] !continuation [[META16:![0-9]+]] { @@ -112,7 +113,8 @@ define void @main() { ; JUMP-INLINER-CPS: Callable.exit: ; JUMP-INLINER-CPS-NEXT: unreachable ; JUMP-INLINER-CPS: _cont_CallShader.exit: -; JUMP-INLINER-CPS-NEXT: ret void +; JUMP-INLINER-CPS-NEXT: call void @lgc.cps.complete() +; JUMP-INLINER-CPS-NEXT: unreachable ; %params = alloca %struct.TheirParams, align 4 call void @dx.op.callShader.struct.TheirParams(i32 159, i32 1, %struct.TheirParams* nonnull %params) diff --git a/llvmraytracing/test/dx/intersection-registercount.ll b/llvmraytracing/test/dx/intersection-registercount.ll index b95b9c5e00..d0e8edd950 100644 --- a/llvmraytracing/test/dx/intersection-registercount.ll +++ b/llvmraytracing/test/dx/intersection-registercount.ll @@ -1,6 +1,6 @@ -; RUN: opt --verify-each --report-payload-register-sizes -passes='dxil-cont-intrinsic-prepare,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,continuations-stats-report,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s +; RUN: opt --verify-each --report-payload-register-sizes=max 
-passes='dxil-cont-intrinsic-prepare,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,continuations-stats-report,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s -; CHECK: Incoming and max outgoing payload VGPR size of "Intersection" (intersection): 100 and 100 bytes +; CHECK: Incoming and max outgoing payload VGPR size of "Intersection" (intersection): 25 and 25 dwords target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" diff --git a/llvmraytracing/test/dx/intrinsics/complete.ll b/llvmraytracing/test/dx/intrinsics/complete.ll new file mode 100644 index 0000000000..38b7c78b8a --- /dev/null +++ b/llvmraytracing/test/dx/intrinsics/complete.ll @@ -0,0 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,lower-raytracing-pipeline,lint' -S %s --lint-abort-on-error | FileCheck --check-prefix=LOWERRAYTRACINGPIPELINE %s +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck --check-prefix=CLEANUP %s + +%struct.DispatchSystemData = type { i32 } +%struct.TraversalData = type { i32 } + +@debug_global = external global i32 +declare i32 @Val(i32) +declare void @_AmdComplete() +declare !pointeetys !2 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) +declare !pointeetys !3 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind) + +define void 
@_cont_Traversal(%struct.TraversalData %data) #1 !lgc.rt.shaderstage !0 { +; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.TraversalData @_cont_Traversal( +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-NEXT: AllocaSpillBB: +; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_TRAVERSALDATA]], align 8 +; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [30 x i32], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [30 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_TRAVERSALDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL:%.*]] = call i32 @Val(i32 5) +; LOWERRAYTRACINGPIPELINE-NEXT: call void @lgc.cps.complete() +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[VAL]], ptr @debug_global, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable +; +; CLEANUP-LABEL: define void @_cont_Traversal( +; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] !continuation.state [[META1:![0-9]+]] { +; CLEANUP-NEXT: AllocaSpillBB: +; CLEANUP-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 0 +; CLEANUP-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 1 +; CLEANUP-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 2 +; CLEANUP-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 3 +; CLEANUP-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 4 +; CLEANUP-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 
5 +; CLEANUP-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 6 +; CLEANUP-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 7 +; CLEANUP-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 8 +; CLEANUP-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 9 +; CLEANUP-NEXT: [[PAYLOAD_FCA_10_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 10 +; CLEANUP-NEXT: [[PAYLOAD_FCA_11_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 11 +; CLEANUP-NEXT: [[PAYLOAD_FCA_12_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 12 +; CLEANUP-NEXT: [[PAYLOAD_FCA_13_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 13 +; CLEANUP-NEXT: [[PAYLOAD_FCA_14_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 14 +; CLEANUP-NEXT: [[PAYLOAD_FCA_15_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 15 +; CLEANUP-NEXT: [[PAYLOAD_FCA_16_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 16 +; CLEANUP-NEXT: [[PAYLOAD_FCA_17_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 17 +; CLEANUP-NEXT: [[PAYLOAD_FCA_18_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 18 +; CLEANUP-NEXT: [[PAYLOAD_FCA_19_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 19 +; CLEANUP-NEXT: [[PAYLOAD_FCA_20_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 20 +; CLEANUP-NEXT: [[PAYLOAD_FCA_21_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 21 +; CLEANUP-NEXT: [[PAYLOAD_FCA_22_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 22 +; CLEANUP-NEXT: [[PAYLOAD_FCA_23_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 23 +; CLEANUP-NEXT: [[PAYLOAD_FCA_24_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 24 +; CLEANUP-NEXT: [[PAYLOAD_FCA_25_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 25 +; CLEANUP-NEXT: [[PAYLOAD_FCA_26_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 26 +; CLEANUP-NEXT: [[PAYLOAD_FCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 27 +; CLEANUP-NEXT: 
[[PAYLOAD_FCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 28 +; CLEANUP-NEXT: [[PAYLOAD_FCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 29 +; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 0 +; CLEANUP-NEXT: [[VAL:%.*]] = call i32 @Val(i32 5) +; CLEANUP-NEXT: ret void +; CLEANUP: AllocaSpillBB.split: +; CLEANUP-NEXT: unreachable +; +AllocaSpillBB: + %val = call i32 @Val(i32 5) + call void @_AmdComplete() + store i32 %val, i32* @debug_global, align 4 + unreachable +} + +!0 = !{i32 6} +!2 = !{%struct.DispatchSystemData poison} +!3 = !{%struct.TraversalData poison} diff --git a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-get-i32.ll b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-get-i32.ll index fff640b854..79ff0e2e14 100644 --- a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-get-i32.ll +++ b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-get-i32.ll @@ -25,7 +25,7 @@ define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwin ret void } -declare void @lgc.ilcps.waitContinue(...) +declare void @lgc.cps.jump(...) 
define void @_cont_Traversal(%struct.TraversalData %data) #1 !lgc.rt.shaderstage !3 { ; ALL-LABEL: define void @_cont_Traversal( @@ -39,7 +39,6 @@ define void @_cont_Traversal(%struct.TraversalData %data) #1 !lgc.rt.shaderstage ; ALL-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; ALL-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 0 ; ALL-NEXT: store i32 [[PAYLOAD_FCA_5_EXTRACT]], ptr @debug_global, align 4 -; ALL-NEXT: [[DOTFCA_0_INSERT3:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; ALL-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; ALL-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 ; ALL-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 2 @@ -58,20 +57,20 @@ define void @_cont_Traversal(%struct.TraversalData %data) #1 !lgc.rt.shaderstage ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP1:%.*]] = getelementptr [4 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP2]], ptr @debug_global, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = load [4 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 poison, [[STRUCT_SYSTEMDATA:%.*]] poison, [8 x i32] poison, [4 x i32] [[TMP4]]), !continuation.registercount [[META0]], !waitmask [[META4:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = load [4 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 0, i32 -1, {} poison, i64 poison, [[STRUCT_SYSTEMDATA:%.*]] poison, [8 x i32] poison, [4 x i32] [[TMP3]]), !waitmask [[META4:![0-9]+]], !continuation.registercount [[META0]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; entry: %val = call i32 @_AmdContPayloadRegistersGetI32(i32 2) store i32 %val, i32* @debug_global, align 4 - call void (...) @lgc.ilcps.waitContinue(i64 0, i64 -1, i32 2, i64 poison, %struct.SystemData poison) + call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 poison, %struct.SystemData poison), !waitmask !2 unreachable } !continuation.maxPayloadRegisterCount = !{!18} +!2 = !{i32 -1} !3 = !{i32 6} !4 = !{i32 8, i32 12, i32 6, i32 16, i32 7, i32 8, i32 5, !5} !5 = !{i32 0} diff --git a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll index 09fb6b4991..03fef54610 100644 --- a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll +++ b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll @@ -37,7 +37,8 @@ define void @main() { ; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: store i32 11, ptr @debug_global, align 4 -; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: ret void +; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: call void @lgc.cps.complete() +; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: unreachable ; entry: %val = call i32 @_AmdContPayloadRegistersI32Count() diff --git a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-set-i32.ll b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-set-i32.ll index 79e350a945..e570e21080 100644 --- a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-set-i32.ll +++ b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-set-i32.ll @@ -21,7 +21,7 @@ declare 
!pointeetys !17 i1 @_cont_ReportHit(%struct.TraversalData* %data, float !continuation.maxPayloadRegisterCount = !{!18} -declare void @lgc.ilcps.waitContinue(...) +declare void @lgc.cps.jump(...) define void @_cont_Traversal(%struct.TraversalData %data) #1 !lgc.rt.shaderstage !3 { ; ALL-LABEL: define void @_cont_Traversal( @@ -34,7 +34,6 @@ define void @_cont_Traversal(%struct.TraversalData %data) #1 !lgc.rt.shaderstage ; ALL-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 2 ; ALL-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[PAYLOAD]], 3 ; ALL-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 0 -; ALL-NEXT: [[DOTFCA_0_INSERT2:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; ALL-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; ALL-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 ; ALL-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 @@ -52,17 +51,17 @@ define void @_cont_Traversal(%struct.TraversalData %data) #1 !lgc.rt.shaderstage ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_TRAVERSALDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP1:%.*]] = getelementptr [4 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 3 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 42, ptr [[TMP1]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = load [4 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 0, i32 -1, {} poison, i64 poison, [[STRUCT_SYSTEMDATA:%.*]] poison, [8 x i32] poison, [4 x i32] [[TMP3]]), !continuation.registercount [[META0]], !waitmask [[META4:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = load [4 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 poison, [[STRUCT_SYSTEMDATA:%.*]] poison, [8 x i32] poison, [4 x i32] [[TMP2]]), !waitmask [[META4:![0-9]+]], !continuation.registercount [[META0]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; entry: call void @_AmdContPayloadRegistersSetI32(i32 3, i32 42) - call void (...) @lgc.ilcps.waitContinue(i64 0, i64 -1, i32 2, i64 poison, %struct.SystemData poison) + call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 poison, %struct.SystemData poison), !waitmask !2 unreachable } +!2 = !{i32 -1} !3 = !{i32 6} !4 = !{i32 8, i32 7, i32 6, i32 16, i32 7, i32 8, i32 5, !5} !5 = !{i32 0} diff --git a/llvmraytracing/test/dx/intrinsics/get-current-func-addr.ll b/llvmraytracing/test/dx/intrinsics/get-current-func-addr.ll index 90bcb8777e..0e87ed5411 100644 --- a/llvmraytracing/test/dx/intrinsics/get-current-func-addr.ll +++ b/llvmraytracing/test/dx/intrinsics/get-current-func-addr.ll @@ -18,7 +18,7 @@ define void @MyRayGen() { ; CHECK-CPS-LABEL: define void @MyRayGen() { ; CHECK-CPS-NEXT: AllocaSpillBB: ; CHECK-CPS-NEXT: [[TMP0:%.*]] = call i32 (...) @lgc.cps.as.continuation.reference__i32(ptr @MyRayGen) -; CHECK-CPS-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0:%.*]] to i64 +; CHECK-CPS-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-CPS-NEXT: call void @Use(i64 [[TMP1]]) ; CHECK-CPS-NEXT: ret void ; @@ -38,7 +38,7 @@ define void @MyRayGen.resume.0() { ; CHECK-CPS-LABEL: define void @MyRayGen.resume.0() { ; CHECK-CPS-NEXT: entryresume.0: ; CHECK-CPS-NEXT: [[TMP0:%.*]] = call i32 (...) 
@lgc.cps.as.continuation.reference__i32(ptr @MyRayGen.resume.0) -; CHECK-CPS-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0:%.*]] to i64 +; CHECK-CPS-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-CPS-NEXT: call void @Use(i64 [[TMP1]]) ; CHECK-CPS-NEXT: ret void ; diff --git a/llvmraytracing/test/dx/intrinsics/get-rtip.ll b/llvmraytracing/test/dx/intrinsics/get-rtip.ll index 7f87f75348..b3688329c8 100644 --- a/llvmraytracing/test/dx/intrinsics/get-rtip.ll +++ b/llvmraytracing/test/dx/intrinsics/get-rtip.ll @@ -10,7 +10,7 @@ declare !pointeetys !8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) define void @main() !lgc.rt.shaderstage !1 { ; CHECK-LABEL: define void @main( -; CHECK-SAME: ) !lgc.rt.shaderstage [[META3:![0-9]+]] { +; CHECK-SAME: ) !lgc.rt.shaderstage [[META2:![0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: store i32 2, ptr @debug_global, align 4 ; CHECK-NEXT: ret void diff --git a/llvmraytracing/test/dx/intrinsics/get-shader-kind.ll b/llvmraytracing/test/dx/intrinsics/get-shader-kind.ll index ed2e841e3d..601d72b158 100644 --- a/llvmraytracing/test/dx/intrinsics/get-shader-kind.ll +++ b/llvmraytracing/test/dx/intrinsics/get-shader-kind.ll @@ -29,7 +29,7 @@ define float @_cont_RayTCurrent() { ; Note: DXILShaderKind::Miss has value 11 define void @MyMiss(%struct.Payload* %payload) !pointeetys !1 !lgc.rt.shaderstage !16 { ; CHECK-LABEL: define %struct.DispatchSystemData @MyMiss -; CHECK-SAME: (i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META12:![0-9]+]] !continuation.registercount [[META5:![0-9]+]] !continuation [[META13:![0-9]+]] { +; CHECK-SAME: (i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META9:![0-9]+]] !continuation.registercount [[META5:![0-9]+]] !continuation [[META10:![0-9]+]] { ; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca 
[[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; CHECK-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [7 x i32], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_PAYLOAD:%.*]], align 8 @@ -47,7 +47,7 @@ define void @MyMiss(%struct.Payload* %payload) !pointeetys !1 !lgc.rt.shaderstag ; CHECK-NEXT: store i32 [[TMP8]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]], [8 x i32] poison, [1 x i32] [[TMP10]]), !continuation.registercount [[META5]] +; CHECK-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]], [8 x i32] poison, [1 x i32] [[TMP10]]), !continuation.registercount [[META5]] ; CHECK-NEXT: unreachable ; %1 = call i32 @_AmdGetShaderKind() diff --git a/llvmraytracing/test/dx/intrinsics/is-llpc.ll b/llvmraytracing/test/dx/intrinsics/is-llpc.ll new file mode 100644 index 0000000000..57612b938e --- /dev/null +++ b/llvmraytracing/test/dx/intrinsics/is-llpc.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint' -S %s --lint-abort-on-error | FileCheck %s + +declare i1 @_AmdIsLlpc() + +%struct.DispatchSystemData = type { i32 } +declare !pointeetys !8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) + +@debug_global = external global i32 + +define void @main() !lgc.rt.shaderstage !1 { +; CHECK-LABEL: define void @main( +; CHECK-SAME: ) !lgc.rt.shaderstage [[META1:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: store i1 false, ptr @debug_global, align 1 +; CHECK-NEXT: ret void +; +entry: + %val = call i1 @_AmdIsLlpc() + store i1 %val, ptr @debug_global + ret void +} + 
+!0 = !{i32 2} +!1 = !{i32 0} +!8 = !{%struct.DispatchSystemData poison} diff --git a/llvmraytracing/test/dx/intrinsics/shader-index.ll b/llvmraytracing/test/dx/intrinsics/shader-index.ll index b00d06be1e..c69eff2f46 100644 --- a/llvmraytracing/test/dx/intrinsics/shader-index.ll +++ b/llvmraytracing/test/dx/intrinsics/shader-index.ll @@ -20,14 +20,15 @@ define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwin define void @main() !lgc.rt.shaderstage !24 { ; CHECK-LABEL: define void @main( -; CHECK-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META13:![0-9]+]] !lgc.cps [[META10:![0-9]+]] !continuation [[META14:![0-9]+]] { +; CHECK-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META12:![0-9]+]] !lgc.cps [[META10:![0-9]+]] !continuation [[META13:![0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; CHECK-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [0 x i32], align 4 ; CHECK-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; CHECK-NEXT: store i32 0, ptr @debug_global, align 4 -; CHECK-NEXT: ret void +; CHECK-NEXT: call void @lgc.cps.complete() +; CHECK-NEXT: unreachable ; entry: %val = call i32 @lgc.rt.shader.index() @@ -37,7 +38,7 @@ entry: define void @callable(%struct.Payload* %payload) !pointeetys !22 !lgc.rt.shaderstage !25 { ; CHECK-LABEL: define void @callable( -; CHECK-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META15:![0-9]+]] !lgc.cps [[META16:![0-9]+]] !continuation 
[[META17:![0-9]+]] { +; CHECK-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META14:![0-9]+]] !lgc.cps [[META15:![0-9]+]] !continuation [[META16:![0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; CHECK-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [1 x i32], align 4 diff --git a/llvmraytracing/test/dx/intrinsics/value-i32.ll b/llvmraytracing/test/dx/intrinsics/value-i32.ll index 46b359aab0..d2361d76cd 100644 --- a/llvmraytracing/test/dx/intrinsics/value-i32.ll +++ b/llvmraytracing/test/dx/intrinsics/value-i32.ll @@ -31,7 +31,7 @@ define i32 @get(%struct.Payload* %pl) !pointeetys !0 { define void @set(%struct.Payload* %pl, i32 %val) !pointeetys !0 { ; CHECK-LABEL: define void @set -; CHECK-SAME: (ptr [[PL:%.*]], i32 [[VAL:%.*]]) !pointeetys [[META3:![0-9]+]] { +; CHECK-SAME: (ptr [[PL:%.*]], i32 [[VAL:%.*]]) !pointeetys [[META1]] { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PL]], i32 2 ; CHECK-NEXT: store i32 [[VAL]], ptr [[TMP1]], align 4 ; CHECK-NEXT: ret void diff --git a/llvmraytracing/test/dx/lint/undef-jump-target.ll b/llvmraytracing/test/dx/lint/undef-jump-target.ll index a3f9b2b829..3b0d90876c 100644 --- a/llvmraytracing/test/dx/lint/undef-jump-target.ll +++ b/llvmraytracing/test/dx/lint/undef-jump-target.ll @@ -6,10 +6,10 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: %struct.DispatchSystemData = type { i32 } -declare void @lgc.ilcps.continue(...) +declare void @lgc.cps.jump(...) define void @RayGen(i64 %dummyRetAddr, %struct.DispatchSystemData %0) !lgc.rt.shaderstage !0 !continuation.entry !1 !continuation !2 { - call void (...) @lgc.ilcps.continue(i64 undef, i32 undef, i64 undef), !continuation.registercount !0 + call void (...) 
@lgc.cps.jump(i64 undef), !continuation.registercount !0 unreachable } diff --git a/llvmraytracing/test/dx/lower-await.ll b/llvmraytracing/test/dx/lower-await.ll index 7df77fa724..1ac5f29e97 100644 --- a/llvmraytracing/test/dx/lower-await.ll +++ b/llvmraytracing/test/dx/lower-await.ll @@ -10,8 +10,10 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: declare void @await.void(%continuation.token*) declare i32 @await.i32(%continuation.token*) declare %continuation.token* @async_fun() -declare %continuation.token* @async_fun_with_waitmask(i64) +declare %continuation.token* @async_fun_with_waitmask() declare %continuation.token* @async_fun_with_arg(i32) +declare void @lgc.cps.jump(...) +declare void @lgc.cps.complete() define void @simple_await(i64 %dummyRetAddr) !continuation.registercount !1 { ; AWAIT-LABEL: define { ptr, ptr } @simple_await( @@ -20,14 +22,14 @@ define void @simple_await(i64 %dummyRetAddr) !continuation.registercount !1 { ; AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null) ; AWAIT-NEXT: [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; AWAIT-NEXT: [[TMP4:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TOK]]) -; AWAIT-NEXT: call void (...) @lgc.ilcps.return(i64 [[DUMMYRETADDR]]), !continuation.registercount [[META1]] +; AWAIT-NEXT: call void (...) 
@lgc.cps.jump(i64 [[DUMMYRETADDR]], i32 -1, {} poison, i64 poison), !continuation.registercount [[META1]] ; AWAIT-NEXT: unreachable ; ; CORO-LABEL: define { ptr, ptr } @simple_await( ; CORO-SAME: i64 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] { ; CORO-NEXT: AllocaSpillBB: -; CORO-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0 -; CORO-NEXT: store i64 [[DUMMYRETADDR]], ptr [[RETURNADDR_SPILL_ADDR]], align 4 +; CORO-NEXT: [[DUMMYRETADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0 +; CORO-NEXT: store i64 [[DUMMYRETADDR]], ptr [[DUMMYRETADDR_SPILL_ADDR]], align 4 ; CORO-NEXT: [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; CORO-NEXT: [[TMP1:%.*]] = insertvalue { ptr, ptr } poison, ptr @simple_await.resume.0, 0 ; CORO-NEXT: [[TMP2:%.*]] = insertvalue { ptr, ptr } [[TMP1]], ptr [[TOK]], 1 @@ -37,15 +39,16 @@ define void @simple_await(i64 %dummyRetAddr) !continuation.registercount !1 { ; CLEANED-SAME: i64 [[DUMMYRETADDR:%.*]]) !continuation.registercount [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { ; CLEANED-NEXT: AllocaSpillBB: ; CLEANED-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) -; CLEANED-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 -; CLEANED-NEXT: store i64 [[DUMMYRETADDR]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4 +; CLEANED-NEXT: [[DUMMYRETADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 +; CLEANED-NEXT: store i64 [[DUMMYRETADDR]], ptr addrspace(32) [[DUMMYRETADDR_SPILL_ADDR]], 
align 4 ; CLEANED-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await.resume.0) ; CLEANED-NEXT: call void (...) @lgc.cps.jump(i64 ptrtoint (ptr @async_fun to i64), i32 -1, {} poison, i64 [[TMP0]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; CLEANED-NEXT: unreachable ; %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1 call void @await.void(%continuation.token* %tok) - ret void, !continuation.registercount !1 + call void (...) @lgc.cps.jump(i64 %dummyRetAddr, i32 -1, {} poison, i64 poison), !continuation.registercount !1 + unreachable } define void @simple_await_entry() !continuation.entry !0 !continuation.registercount !1 { @@ -55,7 +58,7 @@ define void @simple_await_entry() !continuation.entry !0 !continuation.registerc ; AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null) ; AWAIT-NEXT: [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; AWAIT-NEXT: [[TMP4:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TOK]]) -; AWAIT-NEXT: call void (...) 
@lgc.ilcps.return(i64 undef) +; AWAIT-NEXT: call void @lgc.cps.complete() ; AWAIT-NEXT: unreachable ; ; CORO-LABEL: define { ptr, ptr } @simple_await_entry( @@ -76,7 +79,8 @@ define void @simple_await_entry() !continuation.entry !0 !continuation.registerc %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1 call void @await.void(%continuation.token* %tok) ; Note: entry functions don't need a registercount annotation on return - ret void + call void @lgc.cps.complete() + unreachable } define void @await_with_arg(i64 %dummyRetAddr, i32 %i) !continuation.registercount !1 { @@ -86,14 +90,14 @@ define void @await_with_arg(i64 %dummyRetAddr, i32 %i) !continuation.registercou ; AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null) ; AWAIT-NEXT: [[TOK:%.*]] = call ptr @async_fun_with_arg(i32 [[I]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; AWAIT-NEXT: [[TMP4:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TOK]]) -; AWAIT-NEXT: call void (...) @lgc.ilcps.return(i64 [[DUMMYRETADDR]]), !continuation.registercount [[META1]] +; AWAIT-NEXT: call void (...) 
@lgc.cps.jump(i64 [[DUMMYRETADDR]], i32 -1, {} poison, i64 poison), !continuation.registercount [[META1]] ; AWAIT-NEXT: unreachable ; ; CORO-LABEL: define { ptr, ptr } @await_with_arg( ; CORO-SAME: i64 [[DUMMYRETADDR:%.*]], i32 [[I:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META5:![0-9]+]] { ; CORO-NEXT: AllocaSpillBB: -; CORO-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_ARG_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0 -; CORO-NEXT: store i64 [[DUMMYRETADDR]], ptr [[RETURNADDR_SPILL_ADDR]], align 4 +; CORO-NEXT: [[DUMMYRETADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_ARG_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0 +; CORO-NEXT: store i64 [[DUMMYRETADDR]], ptr [[DUMMYRETADDR_SPILL_ADDR]], align 4 ; CORO-NEXT: [[TOK:%.*]] = call ptr @async_fun_with_arg(i32 [[I]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; CORO-NEXT: [[TMP1:%.*]] = insertvalue { ptr, ptr } poison, ptr @await_with_arg.resume.0, 0 ; CORO-NEXT: [[TMP2:%.*]] = insertvalue { ptr, ptr } [[TMP1]], ptr [[TOK]], 1 @@ -103,15 +107,16 @@ define void @await_with_arg(i64 %dummyRetAddr, i32 %i) !continuation.registercou ; CLEANED-SAME: i64 [[DUMMYRETADDR:%.*]], i32 [[I:%.*]]) !continuation.registercount [[META1]] !continuation [[META6:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { ; CLEANED-NEXT: AllocaSpillBB: ; CLEANED-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) -; CLEANED-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_ARG_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 -; CLEANED-NEXT: store i64 [[DUMMYRETADDR]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4 +; CLEANED-NEXT: [[DUMMYRETADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_ARG_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 +; CLEANED-NEXT: store i64 [[DUMMYRETADDR]], ptr 
addrspace(32) [[DUMMYRETADDR_SPILL_ADDR]], align 4 ; CLEANED-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @await_with_arg.resume.0) ; CLEANED-NEXT: call void (...) @lgc.cps.jump(i64 ptrtoint (ptr @async_fun_with_arg to i64), i32 -1, {} poison, i64 [[TMP0]], i32 [[I]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; CLEANED-NEXT: unreachable ; %tok = call %continuation.token* @async_fun_with_arg(i32 %i), !continuation.registercount !1, !continuation.returnedRegistercount !1 call void @await.void(%continuation.token* %tok) - ret void, !continuation.registercount !1 + call void (...) @lgc.cps.jump(i64 %dummyRetAddr, i32 -1, {} poison, i64 poison), !continuation.registercount !1 + unreachable } define i32 @await_with_ret_value(i64 %dummyRetAddr) !continuation.registercount !1 { @@ -122,7 +127,7 @@ define i32 @await_with_ret_value(i64 %dummyRetAddr) !continuation.registercount ; AWAIT-NEXT: [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; AWAIT-NEXT: [[TMP4:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TOK]]) ; AWAIT-NEXT: [[TMP5:%.*]] = call i32 @lgc.ilcps.getReturnValue__i32() -; AWAIT-NEXT: call void (...) @lgc.ilcps.return(i64 [[DUMMYRETADDR]], i32 [[TMP5]]), !continuation.registercount [[META1]] +; AWAIT-NEXT: call void (...) @lgc.cps.jump(i64 [[DUMMYRETADDR]], i32 -1, {} poison, i64 poison, i32 [[TMP5]]), !continuation.registercount [[META1]] ; AWAIT-NEXT: unreachable ; ; CORO-LABEL: define { ptr, ptr } @await_with_ret_value( @@ -147,7 +152,8 @@ define i32 @await_with_ret_value(i64 %dummyRetAddr) !continuation.registercount ; %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1 %res = call i32 @await.i32(%continuation.token* %tok) - ret i32 %res, !continuation.registercount !1 + call void (...) 
@lgc.cps.jump(i64 %dummyRetAddr, i32 -1, {} poison, i64 poison, i32 %res), !continuation.registercount !1 + unreachable } define void @wait_await(i64 %dummyRetAddr) !continuation.registercount !1 { @@ -155,9 +161,9 @@ define void @wait_await(i64 %dummyRetAddr) !continuation.registercount !1 { ; AWAIT-SAME: i64 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META7:![0-9]+]] { ; AWAIT-NEXT: [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.wait_await, ptr @continuation.malloc, ptr @continuation.free) ; AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null) -; AWAIT-NEXT: [[TOK:%.*]] = call ptr @async_fun_with_waitmask(i64 -1), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]], !continuation.wait.await [[META3]] +; AWAIT-NEXT: [[TOK:%.*]] = call ptr @async_fun_with_waitmask(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]], !waitmask [[META8:![0-9]+]] ; AWAIT-NEXT: [[TMP4:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TOK]]) -; AWAIT-NEXT: call void (...) @lgc.ilcps.return(i64 [[DUMMYRETADDR]]), !continuation.registercount [[META1]] +; AWAIT-NEXT: call void (...) 
@lgc.cps.jump(i64 [[DUMMYRETADDR]], i32 -1, i64 poison, i64 poison), !continuation.registercount [[META1]] ; AWAIT-NEXT: unreachable ; ; CORO-LABEL: define { ptr, ptr } @wait_await( @@ -165,7 +171,7 @@ define void @wait_await(i64 %dummyRetAddr) !continuation.registercount !1 { ; CORO-NEXT: AllocaSpillBB: ; CORO-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[WAIT_AWAIT_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0 ; CORO-NEXT: store i64 [[DUMMYRETADDR]], ptr [[RETURNADDR_SPILL_ADDR]], align 4 -; CORO-NEXT: [[TOK:%.*]] = call ptr @async_fun_with_waitmask(i64 -1), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]], !continuation.wait.await [[META3]] +; CORO-NEXT: [[TOK:%.*]] = call ptr @async_fun_with_waitmask(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]], !waitmask [[META8:![0-9]+]] ; CORO-NEXT: [[TMP1:%.*]] = insertvalue { ptr, ptr } poison, ptr @wait_await.resume.0, 0 ; CORO-NEXT: [[TMP2:%.*]] = insertvalue { ptr, ptr } [[TMP1]], ptr [[TOK]], 1 ; CORO-NEXT: ret { ptr, ptr } [[TMP2]] @@ -174,15 +180,16 @@ define void @wait_await(i64 %dummyRetAddr) !continuation.registercount !1 { ; CLEANED-SAME: i64 [[DUMMYRETADDR:%.*]]) !continuation.registercount [[META1]] !continuation [[META8:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { ; CLEANED-NEXT: AllocaSpillBB: ; CLEANED-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) -; CLEANED-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[WAIT_AWAIT_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 -; CLEANED-NEXT: store i64 [[DUMMYRETADDR]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4 +; CLEANED-NEXT: [[DUMMYRETADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[WAIT_AWAIT_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 +; CLEANED-NEXT: store i64 [[DUMMYRETADDR]], ptr addrspace(32) [[DUMMYRETADDR_SPILL_ADDR]], 
align 4 ; CLEANED-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @wait_await.resume.0) ; CLEANED-NEXT: call void (...) @lgc.cps.jump(i64 ptrtoint (ptr @async_fun_with_waitmask to i64), i32 -1, {} poison, i64 [[TMP0]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]], !waitmask [[META9:![0-9]+]] ; CLEANED-NEXT: unreachable ; - %tok = call %continuation.token* @async_fun_with_waitmask(i64 -1), !continuation.wait.await !0, !continuation.registercount !1, !continuation.returnedRegistercount !1 + %tok = call %continuation.token* @async_fun_with_waitmask(), !waitmask !3, !continuation.registercount !1, !continuation.returnedRegistercount !1 call void @await.void(%continuation.token* %tok) - ret void, !continuation.registercount !1 + call void (...) @lgc.cps.jump(i64 %dummyRetAddr, i32 -1, i64 poison, i64 poison), !continuation.registercount !1 + unreachable } !continuation.stackAddrspace = !{!2} @@ -190,3 +197,4 @@ define void @wait_await(i64 %dummyRetAddr) !continuation.registercount !1 { !0 = !{} !1 = !{i32 0} !2 = !{i32 21} +!3 = !{i32 -1} diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll b/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll index a66664ddbf..6c498ff781 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll @@ -113,7 +113,10 @@ attributes #0 = { nounwind } ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } [[TMP9]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP5]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; LOWERRAYTRACINGPIPELINE-NEXT: ret void +; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] +; LOWERRAYTRACINGPIPELINE: .split: +; LOWERRAYTRACINGPIPELINE-NEXT: call void @lgc.cps.complete() +; 
LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define i32 @_cont_GetLocalRootIndex( @@ -143,7 +146,10 @@ attributes #0 = { nounwind } ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } [[TMP5]], 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP7]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: ret void +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br label [[DOTSPLIT:%.*]] +; LOWERRAYTRACINGPIPELINE-CPS: .split: +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @lgc.cps.complete() +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define i32 @_cont_GetLocalRootIndex( diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-exit-raygen.ll b/llvmraytracing/test/dx/lower-rt-pipeline-exit-raygen.ll index b330debb16..f441c64dd2 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-exit-raygen.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-exit-raygen.ll @@ -88,5 +88,6 @@ attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="fa ; LOWERRAYTRACINGPIPELINE-NEXT: call void @lgc.ilcps.waitContinue(i64 [[ADDR_I]], i64 -1, [[STRUCT_SYSTEMDATA]] [[SYSTEMDATA_I]]) #[[ATTR3:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: _cont_ExitRayGen.exit: -; LOWERRAYTRACINGPIPELINE-NEXT: ret void +; LOWERRAYTRACINGPIPELINE-NEXT: call void @lgc.cps.complete() +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll b/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll index 7600b8aaa3..57527032aa 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll @@ -186,7 +186,8 @@ define void @RayGen() #3 { ; LOWERRAYTRACINGPIPELINE-NEXT: 
[[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [0 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; LOWERRAYTRACINGPIPELINE-NEXT: ret void +; LOWERRAYTRACINGPIPELINE-NEXT: call void @lgc.cps.complete() +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @RayGen( ; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !continuation [[META28:![0-9]+]] !continuation.entry [[META13:![0-9]+]] { @@ -245,12 +246,12 @@ define void @Intersection() #3 { ; LOWERRAYTRACINGPIPELINE: 19: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], [8 x i32] poison, [30 x i32] [[TMP21]]), !continuation.registercount [[META25]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], [8 x i32] poison, [30 x i32] [[TMP21]]), !continuation.registercount [[META25]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 22: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP19]], [8 x i32] poison, [30 x i32] [[TMP24]]), !continuation.registercount [[META25]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP19]], [8 x i32] poison, [30 x i32] [[TMP24]]), !continuation.registercount [[META25]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @Intersection( @@ -410,7 +411,7 @@ define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.Buil ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP22]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP20]], ptr [[TMP16]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP7]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP7]], align 4 @@ -474,7 +475,7 @@ define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.Buil ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP57]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP56]]) ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP73:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP58]], [8 x i32] poison, [10 x i32] [[TMP73]]), !continuation.registercount [[META26]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP58]], [8 x i32] poison, [10 x i32] [[TMP73]]), !continuation.registercount [[META26]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @AnyHit( @@ -525,7 +526,7 @@ define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.Buil ; DXILCONTPOSTPROCESS-NEXT: [[TMP4:%.*]] = bitcast i32 [[PAYLOAD_FCA_0_EXTRACT]] to float ; DXILCONTPOSTPROCESS-NEXT: [[TMP5:%.*]] = bitcast i32 [[PAYLOAD_FCA_7_EXTRACT]] to float ; DXILCONTPOSTPROCESS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP7:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP6]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP7:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP6]]) ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT22:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP7]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_023_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT22]], i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP8:%.*]] = bitcast float [[DOTSROA_023_0_VEC_EXTRACT]] to i32 @@ -680,7 +681,7 @@ define void 
@ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP19]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP22]], ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP16]], ptr [[TMP5]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP5]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP24]], ptr [[HITATTRS]], align 4 @@ -692,14 +693,14 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = call float @_cont_RayTMin(ptr [[TMP39]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[TMP42]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_I:%.*]] = load float, ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[TMP44]], ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = call i32 @_cont_InstanceID(ptr [[TMP28]], ptr [[TMP2]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[TMP46]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1 ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[RES_I1:%.*]] = load i32, ptr [[RESPTR_I]], align 4 @@ -721,7 +722,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP51]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP52]], [20 x i32] poison, [10 x i32] [[TMP45]]), !continuation.registercount [[META26]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP52]], [20 x i32] poison, [10 x i32] [[TMP45]]), !continuation.registercount [[META26]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @ClosestHit( @@ -748,7 +749,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[DOTFCA_1_EXTRACT]], ptr [[DOTFCA_1_GEP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP3:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP3:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[SYSTEM_DATA_ALLOCA]]) ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP3]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_08_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP4:%.*]] = bitcast float [[DOTSROA_08_0_VEC_EXTRACT]] to i32 @@ -758,11 +759,11 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; DXILCONTPOSTPROCESS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP7:%.*]] = call float @_cont_RayTMin(ptr [[TMP6]]) ; DXILCONTPOSTPROCESS-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP9:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP9:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT9:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP9]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_EXTRACT11:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP9]], 1 ; DXILCONTPOSTPROCESS-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP11:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP11:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT20:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP11]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_GEP21:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP1]], i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: store float [[DOTFCA_0_EXTRACT20]], ptr [[DOTFCA_0_GEP21]], align 4 @@ -770,7 +771,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_GEP23:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP1]], i32 0, i32 1 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[DOTFCA_1_EXTRACT22]], ptr [[DOTFCA_1_GEP23]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP12:%.*]] = call i32 @_cont_InstanceID(ptr [[TMP10]], ptr [[TMP1]]) -; DXILCONTPOSTPROCESS-NEXT: [[TMP13:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP13:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT15:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP13]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_EXTRACT17:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP13]], 1 ; DXILCONTPOSTPROCESS-NEXT: [[RESPTR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1 diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics.ll b/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics.ll index c7c23dc628..d4288c5e5f 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics.ll @@ -130,7 +130,7 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP27]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP91]], ptr [[TMP22]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; 
LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP31]], ptr [[TMP11]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP11]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP25]], ptr [[HITATTRS]], align 4 @@ -152,35 +152,35 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = call float @_cont_RayTMin(ptr [[TMP53]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[TMP41]], ptr [[TMP8]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP57:%.*]] = call float @_cont_RayTCurrent(ptr [[TMP55]], ptr [[TMP8]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = call i32 @_cont_RayFlags(ptr [[TMP58]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP61:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP61:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[TMP61]], 
ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = call i32 @_cont_InstanceIndex(ptr [[TMP60]], ptr [[TMP4]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[TMP64]], ptr [[TMP5]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = call i32 @_cont_InstanceID(ptr [[TMP63]], ptr [[TMP5]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[TMP67]], ptr [[TMP6]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = call i32 @_cont_PrimitiveIndex(ptr [[TMP66]], ptr [[TMP6]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[TMP70]], ptr [[TMP9]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = call <3 x float> @_cont_ObjectRayOrigin3(ptr [[TMP69]], ptr [[TMP9]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[K:%.*]] = extractelement <3 x float> [[TMP71]], i8 0 ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP73:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP73:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[TMP73]], ptr [[TMP10]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP74:%.*]] = call <3 x float> @_cont_ObjectRayDirection3(ptr [[TMP72]], ptr [[TMP10]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[L:%.*]] = extractelement <3 x float> [[TMP74]], i8 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP75:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP76:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP76:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[TMP76]], ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP77:%.*]] = call [4 x <3 x float>] @_cont_ObjectToWorld4x3(ptr [[TMP75]], ptr [[TMP2]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [4 x <3 x float>] [[TMP77]], ptr [[TMP13]], align 4 @@ -188,14 +188,14 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect ; LOWERRAYTRACINGPIPELINE-NEXT: [[COL_GEP_LOAD2:%.*]] = load <3 x float>, ptr [[COL_GEP1]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[M:%.*]] = extractelement <3 x float> [[COL_GEP_LOAD2]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP82:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP82:%.*]] 
= call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[TMP82]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP80:%.*]] = call [4 x <3 x float>] @_cont_WorldToObject4x3(ptr [[TMP78]], ptr [[TMP3]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [4 x <3 x float>] [[TMP80]], ptr [[TMP12]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[COL_GEP:%.*]] = getelementptr [4 x <3 x float>], ptr [[TMP12]], i32 0, i8 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[COL_GEP_LOAD:%.*]] = load <3 x float>, ptr [[COL_GEP]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[N:%.*]] = extractelement <3 x float> [[COL_GEP_LOAD]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP83:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP83:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[TMP83]], ptr [[TMP7]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP84:%.*]] = call i32 @_cont_HitKind(ptr [[SYSTEM_DATA_ALLOCA]], ptr [[TMP7]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP85:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP14]], i32 0 @@ -216,7 +216,7 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP97:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP98:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP97]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP81:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP98]], [17 x i32] poison, [10 x i32] [[TMP81]]), !continuation.registercount [[META20]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP98]], [17 x i32] poison, [10 x i32] [[TMP81]]), !continuation.registercount [[META20]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @ClosestHit( @@ -250,7 +250,7 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: store <3 x i32> [[DOTFCA_0_0_EXTRACT]], ptr [[DOTFCA_0_0_GEP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP11:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP11:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT19:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP11]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_021_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT19]], i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP12:%.*]] = bitcast float [[DOTSROA_021_0_VEC_EXTRACT]] to i32 @@ -272,7 +272,7 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect ; DXILCONTPOSTPROCESS-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP23:%.*]] = call float @_cont_RayTMin(ptr [[TMP22]]) ; DXILCONTPOSTPROCESS-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], 
i32 0, i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP25:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP25:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT29:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP25]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_GEP30:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP7]], i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: store float [[DOTFCA_0_EXTRACT29]], ptr [[DOTFCA_0_GEP30]], align 4 @@ -283,7 +283,7 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect ; DXILCONTPOSTPROCESS-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP28:%.*]] = call i32 @_cont_RayFlags(ptr [[TMP27]]) ; DXILCONTPOSTPROCESS-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP30:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP30:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT45:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP30]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_GEP46:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP3]], i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: store float [[DOTFCA_0_EXTRACT45]], ptr [[DOTFCA_0_GEP46]], align 4 @@ -292,7 +292,7 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect ; DXILCONTPOSTPROCESS-NEXT: store i32 [[DOTFCA_1_EXTRACT47]], ptr [[DOTFCA_1_GEP48]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP31:%.*]] = call i32 @_cont_InstanceIndex(ptr 
[[TMP29]], ptr [[TMP3]]) ; DXILCONTPOSTPROCESS-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP33:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP33:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT41:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP33]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_GEP42:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP4]], i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: store float [[DOTFCA_0_EXTRACT41]], ptr [[DOTFCA_0_GEP42]], align 4 @@ -301,7 +301,7 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect ; DXILCONTPOSTPROCESS-NEXT: store i32 [[DOTFCA_1_EXTRACT43]], ptr [[DOTFCA_1_GEP44]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP34:%.*]] = call i32 @_cont_InstanceID(ptr [[TMP32]], ptr [[TMP4]]) ; DXILCONTPOSTPROCESS-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP36:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP36:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT37:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP36]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_GEP38:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP5]], i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: store float [[DOTFCA_0_EXTRACT37]], ptr [[DOTFCA_0_GEP38]], align 4 @@ -310,7 +310,7 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect ; DXILCONTPOSTPROCESS-NEXT: store i32 [[DOTFCA_1_EXTRACT39]], ptr [[DOTFCA_1_GEP40]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP37:%.*]] = call i32 @_cont_PrimitiveIndex(ptr 
[[TMP35]], ptr [[TMP5]]) ; DXILCONTPOSTPROCESS-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP39:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP39:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT25:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP39]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_GEP26:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP8]], i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: store float [[DOTFCA_0_EXTRACT25]], ptr [[DOTFCA_0_GEP26]], align 4 @@ -320,7 +320,7 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect ; DXILCONTPOSTPROCESS-NEXT: [[TMP40:%.*]] = call <3 x float> @_cont_ObjectRayOrigin3(ptr [[TMP38]], ptr [[TMP8]]) ; DXILCONTPOSTPROCESS-NEXT: [[K:%.*]] = extractelement <3 x float> [[TMP40]], i8 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP42:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP42:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT22:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP42]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_GEP23:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP9]], i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: store float [[DOTFCA_0_EXTRACT22]], ptr [[DOTFCA_0_GEP23]], align 4 @@ -330,7 +330,7 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect ; DXILCONTPOSTPROCESS-NEXT: [[TMP43:%.*]] = call <3 x float> @_cont_ObjectRayDirection3(ptr [[TMP41]], ptr [[TMP9]]) ; DXILCONTPOSTPROCESS-NEXT: [[L:%.*]] = 
extractelement <3 x float> [[TMP43]], i8 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP45:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP45:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT53:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP45]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_GEP54:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP1]], i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: store float [[DOTFCA_0_EXTRACT53]], ptr [[DOTFCA_0_GEP54]], align 4 @@ -344,7 +344,7 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue [4 x <3 x float>] [[TMP46]], 3 ; DXILCONTPOSTPROCESS-NEXT: [[M:%.*]] = extractelement <3 x float> [[DOTFCA_0_EXTRACT]], i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP48:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP48:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT49:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP48]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_GEP50:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP2]], i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: store float [[DOTFCA_0_EXTRACT49]], ptr [[DOTFCA_0_GEP50]], align 4 @@ -357,7 +357,7 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_2_EXTRACT13:%.*]] = extractvalue [4 x <3 x float>] [[TMP49]], 2 ; DXILCONTPOSTPROCESS-NEXT: 
[[DOTFCA_3_EXTRACT14:%.*]] = extractvalue [4 x <3 x float>] [[TMP49]], 3 ; DXILCONTPOSTPROCESS-NEXT: [[N:%.*]] = extractelement <3 x float> [[DOTFCA_0_EXTRACT10]], i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP50:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP50:%.*]] = call [[STRUCT_HITDATA]] @[[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT33:%.*]] = extractvalue [[STRUCT_HITDATA]] [[TMP50]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_GEP34:%.*]] = getelementptr inbounds [[STRUCT_HITDATA]], ptr [[TMP6]], i32 0, i32 0 ; DXILCONTPOSTPROCESS-NEXT: store float [[DOTFCA_0_EXTRACT33]], ptr [[DOTFCA_0_GEP34]], align 4 diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll b/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll index b15886a69e..8d69eab346 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll @@ -179,8 +179,8 @@ attributes #3 = { nounwind memory(none) } ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_MEDIUMPAYLOAD]] zeroinitializer, ptr [[P2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_LARGEPAYLOAD]] zeroinitializer, ptr [[P3]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[T1:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[T2:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T1]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[T3:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[T2]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; LOWERRAYTRACINGPIPELINE-NEXT: [[T2:%.*]] = call [[DX_TYPES_HANDLE]] 
@[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T1]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[T3:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[T2]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T3]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP7]], align 4 @@ -192,7 +192,7 @@ attributes #3 = { nounwind memory(none) } ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP9]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 -1, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [1 x i32] [[TMP10]]), !continuation.registercount [[META17]], !continuation.wait.await [[META6:![0-9]+]], !continuation.returnedRegistercount [[META17]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [1 x i32] [[TMP10]]), !continuation.registercount [[META17]], !waitmask [[META20:![0-9]+]], !continuation.returnedRegistercount [[META17]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [1 x i32] } @await(ptr [[TMP19]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [1 x i32] } [[TMP25]], 2 ; 
LOWERRAYTRACINGPIPELINE-NEXT: store [1 x i32] [[TMP13]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 @@ -227,7 +227,7 @@ attributes #3 = { nounwind memory(none) } ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP24]], ptr addrspace(32) [[TMP22]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load [2 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 -1, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I5]], [10 x i32] poison, [2 x i32] [[TMP29]]), !continuation.registercount [[META13:![0-9]+]], !continuation.wait.await [[META6]], !continuation.returnedRegistercount [[META13]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I5]], [10 x i32] poison, [2 x i32] [[TMP29]]), !continuation.registercount [[META13:![0-9]+]], !waitmask [[META20]], !continuation.returnedRegistercount [[META13]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } @await.1(ptr [[TMP41]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } [[TMP44]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [2 x i32] [[TMP60]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 @@ -280,7 +280,7 @@ attributes #3 = { nounwind memory(none) } ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP51]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP52]], ptr addrspace(32) [[TMP50]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = load [2 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 -1, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I10]], [10 x i32] poison, [2 x i32] [[TMP62]]), !continuation.registercount 
[[META13]], !continuation.wait.await [[META6]], !continuation.returnedRegistercount [[META13]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I10]], [10 x i32] poison, [2 x i32] [[TMP62]]), !continuation.registercount [[META13]], !waitmask [[META20]], !continuation.returnedRegistercount [[META13]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } @await.2(ptr [[TMP63]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } [[TMP64]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [2 x i32] [[TMP65]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 @@ -317,7 +317,7 @@ attributes #3 = { nounwind memory(none) } ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP100:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP101:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP100]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP95:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP101]], [27 x i32] poison, [1 x i32] [[TMP95]]), !continuation.registercount [[META17]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP101]], [27 x i32] poison, [1 x i32] [[TMP95]]), !continuation.registercount [[META17]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; @@ -339,8 +339,8 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-NEXT: [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[TMP0]], 0, 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CLEANUP-NEXT: [[T1:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; CLEANUP-NEXT: [[T2:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T1]]) -; CLEANUP-NEXT: [[T3:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[T2]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; CLEANUP-NEXT: [[T2:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T1]]) +; CLEANUP-NEXT: [[T3:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[T2]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; CLEANUP-NEXT: [[TMP1:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T3]]) ; CLEANUP-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0 ; CLEANUP-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 @@ -349,7 +349,7 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[ADDR_I]], 5 ; CLEANUP-NEXT: 
[[DOTFCA_0_INSERT15:%.*]] = insertvalue [1 x i32] poison, i32 0, 0 ; CLEANUP-NEXT: [[TMP3:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.0) -; CLEANUP-NEXT: call void (...) @lgc.cps.jump(i64 4, i32 -1, {} poison, i64 [[TMP3]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT15]]), !continuation.registercount [[META17]], !continuation.returnedRegistercount [[META17]], !waitmask [[META21:![0-9]+]] +; CLEANUP-NEXT: call void (...) @lgc.cps.jump(i64 4, i32 -1, {} poison, i64 [[TMP3]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT15]]), !continuation.registercount [[META17]], !waitmask [[META21:![0-9]+]], !continuation.returnedRegistercount [[META17]] ; CLEANUP-NEXT: unreachable ; ; @@ -364,8 +364,8 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT42:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP7]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CLEANUP-NEXT: [[T110:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; CLEANUP-NEXT: [[T29:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T110]]) -; CLEANUP-NEXT: [[T38:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[T29]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; CLEANUP-NEXT: [[T29:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[T110]]) +; CLEANUP-NEXT: [[T38:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[T29]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; CLEANUP-NEXT: [[TMP3:%.*]] = call i64 
@amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T38]]) ; CLEANUP-NEXT: [[DIS_DATA_I1_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT42]], 0 ; CLEANUP-NEXT: [[SYS_DATA_I2:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I1_FCA_0_INSERT]], 0 @@ -380,7 +380,7 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-NEXT: [[DOTFCA_0_INSERT19:%.*]] = insertvalue [2 x i32] poison, i32 [[TMP5]], 0 ; CLEANUP-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[DOTFCA_0_INSERT19]], i32 0, 1 ; CLEANUP-NEXT: [[TMP8:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.1) -; CLEANUP-NEXT: call void (...) @lgc.cps.jump(i64 4, i32 -1, {} poison, i64 [[TMP8]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I5]], [10 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT]]), !continuation.registercount [[META13:![0-9]+]], !continuation.returnedRegistercount [[META13]], !waitmask [[META21]] +; CLEANUP-NEXT: call void (...) 
@lgc.cps.jump(i64 4, i32 -1, {} poison, i64 [[TMP8]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I5]], [10 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT]]), !continuation.registercount [[META13:![0-9]+]], !waitmask [[META21]], !continuation.returnedRegistercount [[META13]] ; CLEANUP-NEXT: unreachable ; ; @@ -401,8 +401,8 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT12:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP8]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CLEANUP-NEXT: [[T17:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; CLEANUP-NEXT: [[T26:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T17]]) -; CLEANUP-NEXT: [[T35:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[T26]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; CLEANUP-NEXT: [[T26:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[T17]]) +; CLEANUP-NEXT: [[T35:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[T26]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; CLEANUP-NEXT: [[TMP13:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T35]]) ; CLEANUP-NEXT: [[DIS_DATA_I5_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT12]], 0 ; CLEANUP-NEXT: [[SYS_DATA_I6:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I5_FCA_0_INSERT]], 0 @@ -421,7 +421,7 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-NEXT: [[DOTFCA_0_INSERT25:%.*]] = insertvalue [2 x i32] poison, i32 [[TMP14]], 0 ; CLEANUP-NEXT: [[DOTFCA_1_INSERT28:%.*]] = insertvalue [2 x i32] 
[[DOTFCA_0_INSERT25]], i32 0, 1 ; CLEANUP-NEXT: [[TMP17:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.2) -; CLEANUP-NEXT: call void (...) @lgc.cps.jump(i64 4, i32 -1, {} poison, i64 [[TMP17]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I10]], [10 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT28]]), !continuation.registercount [[META13]], !continuation.returnedRegistercount [[META13]], !waitmask [[META21]] +; CLEANUP-NEXT: call void (...) @lgc.cps.jump(i64 4, i32 -1, {} poison, i64 [[TMP17]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I10]], [10 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT28]]), !continuation.registercount [[META13]], !waitmask [[META21]], !continuation.returnedRegistercount [[META13]] ; CLEANUP-NEXT: unreachable ; ; @@ -452,7 +452,7 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-NEXT: [[DOTFCA_0_INSERT41:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT46]], 0 ; CLEANUP-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 0 ; CLEANUP-NEXT: call void @lgc.cps.free(i32 28) -; CLEANUP-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT41]], [27 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]] +; CLEANUP-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR_RELOAD]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT41]], [27 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]] ; CLEANUP-NEXT: unreachable ; ; @@ -474,8 +474,8 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], 0, 0 ; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) ; CLEANUP-CPS-NEXT: [[T1:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; CLEANUP-CPS-NEXT: [[T2:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T1]]) -; CLEANUP-CPS-NEXT: [[T3:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[T2]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; CLEANUP-CPS-NEXT: [[T2:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T1]]) +; CLEANUP-CPS-NEXT: [[T3:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[T2]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; CLEANUP-CPS-NEXT: [[TMP0:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T3]]) ; CLEANUP-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[SYSTEM_DATA_FCA_0_0_EXTRACT]], 0 ; CLEANUP-CPS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 @@ -483,12 +483,12 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-CPS-NEXT: [[TMP1:%.*]] = call i64 
(...) @lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.0) ; CLEANUP-CPS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP1]], 5 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT13:%.*]] = insertvalue [1 x i32] poison, i32 0, 0 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i64 -1, i64 [[TMP1]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT13]]), !continuation.registercount [[META19:![0-9]+]], !continuation.wait.await [[META6:![0-9]+]], !continuation.returnedRegistercount [[META19]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 4, i32 5, {} poison, i64 [[TMP1]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT13]]), !continuation.registercount [[META19:![0-9]+]], !waitmask [[META20:![0-9]+]], !continuation.returnedRegistercount [[META19]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @Miss.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [1 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META13]] !continuation [[META17]] !continuation.stacksize [[META18]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [1 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META13]] !continuation [[META17]] { ; CLEANUP-CPS-NEXT: entryresume.0: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 24) ; CLEANUP-CPS-NEXT: [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MISS_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 0 @@ -498,8 +498,8 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT45:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP6]], 0 ; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CLEANUP-CPS-NEXT: 
[[T110:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; CLEANUP-CPS-NEXT: [[T29:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T110]]) -; CLEANUP-CPS-NEXT: [[T38:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[T29]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; CLEANUP-CPS-NEXT: [[T29:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[T110]]) +; CLEANUP-CPS-NEXT: [[T38:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[T29]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; CLEANUP-CPS-NEXT: [[TMP7:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T38]]) ; CLEANUP-CPS-NEXT: [[DIS_DATA_I1_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT45]], 0 ; CLEANUP-CPS-NEXT: [[SYS_DATA_I2:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I1_FCA_0_INSERT]], 0 @@ -513,12 +513,12 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-CPS-NEXT: store i32 0, ptr addrspace(32) [[TMP10]], align 4 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT17:%.*]] = insertvalue [2 x i32] poison, i32 [[TMP8]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[DOTFCA_0_INSERT17]], i32 0, 1 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i64 -1, i64 [[TMP11]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I5]], [8 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT]]), !continuation.registercount [[META13]], !continuation.wait.await [[META6]], !continuation.returnedRegistercount [[META13]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 4, i32 5, {} poison, i64 [[TMP11]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I5]], [10 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT]]), !continuation.registercount [[META13]], !waitmask [[META20]], !continuation.returnedRegistercount [[META13]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @Miss.resume.1( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [2 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META13]] !continuation [[META17]] !continuation.stacksize [[META18]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [2 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META13]] !continuation [[META17]] { ; CLEANUP-CPS-NEXT: entryresume.1: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 24) ; CLEANUP-CPS-NEXT: [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MISS_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 0 @@ -534,8 +534,8 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT47:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP6]], 0 ; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CLEANUP-CPS-NEXT: [[T17:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; CLEANUP-CPS-NEXT: [[T26:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T17]]) -; CLEANUP-CPS-NEXT: [[T35:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[T26]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; CLEANUP-CPS-NEXT: [[T26:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 
160, [[DX_TYPES_HANDLE]] [[T17]]) +; CLEANUP-CPS-NEXT: [[T35:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[T26]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; CLEANUP-CPS-NEXT: [[TMP12:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T35]]) ; CLEANUP-CPS-NEXT: [[DIS_DATA_I5_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT47]], 0 ; CLEANUP-CPS-NEXT: [[SYS_DATA_I6:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I5_FCA_0_INSERT]], 0 @@ -553,12 +553,12 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-CPS-NEXT: store i32 0, ptr addrspace(32) [[TMP17]], align 4 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT23:%.*]] = insertvalue [2 x i32] poison, i32 [[TMP13]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT26:%.*]] = insertvalue [2 x i32] [[DOTFCA_0_INSERT23]], i32 0, 1 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i64 -1, i64 [[TMP18]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I10]], [8 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT26]]), !continuation.registercount [[META13]], !continuation.wait.await [[META6]], !continuation.returnedRegistercount [[META13]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 4, i32 5, {} poison, i64 [[TMP18]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I10]], [10 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT26]]), !continuation.registercount [[META13]], !waitmask [[META20]], !continuation.returnedRegistercount [[META13]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @Miss.resume.2( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [2 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META13]] !continuation [[META17]] !continuation.stacksize [[META18]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [2 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META13]] !continuation [[META17]] { ; CLEANUP-CPS-NEXT: entryresume.2: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 24) ; CLEANUP-CPS-NEXT: [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MISS_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 0 @@ -613,8 +613,8 @@ attributes #3 = { nounwind memory(none) } ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[TMP0]], 0, 0 ; DXILCONTPOSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; DXILCONTPOSTPROCESS-NEXT: [[T1:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; DXILCONTPOSTPROCESS-NEXT: [[T2:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T1]]) -; DXILCONTPOSTPROCESS-NEXT: [[T3:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[T2]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; DXILCONTPOSTPROCESS-NEXT: [[T2:%.*]] = call [[DX_TYPES_HANDLE]] 
@[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T1]]) +; DXILCONTPOSTPROCESS-NEXT: [[T3:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[T2]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; DXILCONTPOSTPROCESS-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T3]]) ; DXILCONTPOSTPROCESS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 @@ -640,8 +640,8 @@ attributes #3 = { nounwind memory(none) } ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT42:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP14]], 0 ; DXILCONTPOSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; DXILCONTPOSTPROCESS-NEXT: [[T110:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; DXILCONTPOSTPROCESS-NEXT: [[T29:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T110]]) -; DXILCONTPOSTPROCESS-NEXT: [[T38:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[T29]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; DXILCONTPOSTPROCESS-NEXT: [[T29:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[T110]]) +; DXILCONTPOSTPROCESS-NEXT: [[T38:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[T29]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; DXILCONTPOSTPROCESS-NEXT: 
[[TMP5:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T38]]) ; DXILCONTPOSTPROCESS-NEXT: [[DIS_DATA_I1_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT42]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[SYS_DATA_I2:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I1_FCA_0_INSERT]], 0 @@ -683,8 +683,8 @@ attributes #3 = { nounwind memory(none) } ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT44:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP13]], 0 ; DXILCONTPOSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; DXILCONTPOSTPROCESS-NEXT: [[T17:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; DXILCONTPOSTPROCESS-NEXT: [[T26:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T17]]) -; DXILCONTPOSTPROCESS-NEXT: [[T35:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[T26]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; DXILCONTPOSTPROCESS-NEXT: [[T26:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[T17]]) +; DXILCONTPOSTPROCESS-NEXT: [[T35:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[T26]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; DXILCONTPOSTPROCESS-NEXT: [[TMP12:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T35]]) ; DXILCONTPOSTPROCESS-NEXT: [[DIS_DATA_I5_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT44]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[SYS_DATA_I6:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I5_FCA_0_INSERT]], 0 diff --git 
a/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll b/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll index 16e93c0d3d..95170965dc 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll @@ -149,7 +149,7 @@ attributes #1 = { alwaysinline } ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP22]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP23]], [16 x i32] poison, [1 x i32] [[TMP20]]), !continuation.registercount [[META14]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP23]], [16 x i32] poison, [1 x i32] [[TMP20]]), !continuation.registercount [[META14]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; @@ -192,7 +192,7 @@ attributes #1 = { alwaysinline } ; CLEANUP-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT3]], 0 ; CLEANUP-NEXT: [[DOTFCA_0_INSERT1:%.*]] = insertvalue [1 x i32] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; CLEANUP-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [16 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT1]]), !continuation.registercount [[META14]] +; CLEANUP-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR_RELOAD]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [16 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT1]]), !continuation.registercount [[META14]] ; CLEANUP-NEXT: unreachable ; ; diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll b/llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll index c4d6f2a287..3bfff91b94 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll @@ -176,7 +176,7 @@ attributes #3 = { nounwind memory(none) } ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; CHECK-NEXT: [[TMP33:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP32]], align 4 ; CHECK-NEXT: [[TMP36:%.*]] = load [11 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP33]], [16 x i32] poison, [11 x i32] [[TMP36]]), !continuation.registercount [[META22]] +; CHECK-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP33]], [16 x i32] poison, [11 x i32] [[TMP36]]), !continuation.registercount [[META22]] ; CHECK-NEXT: unreachable ; ; @@ -256,7 +256,7 @@ attributes #3 = { nounwind memory(none) } ; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; CHECK-NEXT: [[TMP46:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP45]], align 4 ; CHECK-NEXT: [[TMP53:%.*]] = load [14 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP46]], [16 x i32] poison, [14 x i32] [[TMP53]]), !continuation.registercount [[META19]] +; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP46]], [16 x i32] poison, [14 x i32] [[TMP53]]), !continuation.registercount [[META19]] ; CHECK-NEXT: unreachable ; ; diff --git a/llvmraytracing/test/dx/lower-rt-pipeline.ll b/llvmraytracing/test/dx/lower-rt-pipeline.ll index b34786a0f1..e595b929d8 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline.ll @@ -509,8 +509,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP5]]) #[[ATTR1:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA36:![0-9]+]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load 
[[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], 0 @@ -563,15 +563,16 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP29]], i8 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() ; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP30]], i8 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP31]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP46]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[TMP28]], i64 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP28]], i64 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP28]], i64 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = extractelement <4 x float> [[TMP28]], i64 3 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP32]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP33]], float [[TMP34]], float [[TMP35]], float [[TMP36]], i8 15) ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP5]]) #[[ATTR1]] -; 
LOWERRAYTRACINGPIPELINE-NEXT: ret void +; LOWERRAYTRACINGPIPELINE-NEXT: call void @lgc.cps.complete() +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @MyClosestHitShader( @@ -599,7 +600,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP42]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP13]], ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP14]], ptr [[HITATTRS]], align 4 @@ -638,7 +639,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP37]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP38]], [33 x i32] poison, [10 x i32] [[TMP45]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP38]], [33 x i32] poison, [10 x i32] [[TMP45]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; @@ -677,7 +678,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP26]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP24]], ptr [[TMP20]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP22]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP22]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP23]], ptr [[TMP11]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = load i32, ptr [[TMP11]], align 4 @@ -758,7 +759,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP53]], ptr [[TMP54]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] 
= load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP55]], [8 x i32] poison, [10 x i32] [[TMP63]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP55]], [8 x i32] poison, [10 x i32] [[TMP63]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 64: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 @@ -789,7 +790,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP70]], ptr [[TMP71]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP72:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP84:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP72]], [8 x i32] poison, [10 x i32] [[TMP84]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP72]], [8 x i32] poison, [10 x i32] [[TMP84]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 85: ; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP37]], label [[TMP74:%.*]], label [[TMP109:%.*]] @@ -825,7 +826,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP89]], ptr [[TMP90]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP91:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP132:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP91]], [8 x i32] poison, [10 x i32] [[TMP132]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP91]], [8 x i32] poison, [10 x i32] [[TMP132]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 109: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP93:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 @@ -857,7 +858,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP106]], ptr [[TMP107]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP108:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP130:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP108]], [8 x i32] poison, [10 x i32] [[TMP130]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP108]], [8 x i32] poison, [10 x i32] [[TMP130]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 131: ; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP29]], ptr [[TMP28]], align 4 @@ -888,7 +889,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP122]], ptr [[TMP123]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP124:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP150:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP124]], [8 x i32] poison, [10 x i32] [[TMP150]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP124]], [8 x i32] poison, [10 x i32] [[TMP150]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; @@ -946,13 +947,13 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: 23: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP20]], [8 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP20]], [8 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 26: ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [8 x i32] poison, [30 x i32] [[TMP28]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [8 x i32] poison, [30 x i32] [[TMP28]]), !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; @@ -1010,13 +1011,13 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: 23: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP20]], [8 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP20]], [8 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 26: ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [8 x i32] poison, [30 x i32] [[TMP28]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [8 x i32] poison, [30 x i32] [[TMP28]]), !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; @@ -1064,7 +1065,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP21]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP22]], [33 x i32] poison, [10 x i32] [[TMP29]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP22]], [33 x i32] poison, [10 x i32] [[TMP29]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; @@ -1160,8 +1161,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP5]]) #[[ATTR1:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA37:![0-9]+]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, 
[[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], 0 @@ -1213,15 +1214,16 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP37]], i8 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP38]], i8 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP39]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP39]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = extractelement <4 x float> [[TMP36]], i64 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = extractelement <4 x float> [[TMP36]], i64 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = extractelement <4 x float> [[TMP36]], i64 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = extractelement <4 x float> [[TMP36]], i64 3 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP40]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP41]], float [[TMP42]], float [[TMP43]], float [[TMP44]], i8 15) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP5]]) #[[ATTR1]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: ret void +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @lgc.cps.complete() +; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyClosestHitShader( @@ -1248,7 +1250,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP13]], ptr [[TMP11]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP14]], ptr [[TMP1]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP1]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP15]], ptr [[HITATTRS]], align 4 @@ -1325,7 +1327,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP22]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] 
@[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP22]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP23]], ptr [[TMP9]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP9]], align 4 @@ -1801,8 +1803,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; POSTPROCESS-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; POSTPROCESS-NEXT: [[TMP1:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; POSTPROCESS-NEXT: [[TMP5:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) -; POSTPROCESS-NEXT: [[TMP3:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP5]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; POSTPROCESS-NEXT: [[TMP5:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) +; POSTPROCESS-NEXT: [[TMP3:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP5]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; POSTPROCESS-NEXT: [[TMP4:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP3]]) ; POSTPROCESS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> 
[[DOTFCA_0_EXTRACT20]], 0 ; POSTPROCESS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 @@ -1881,8 +1883,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[VAL_1_I:%.*]] = insertelement <3 x i32> [[VAL_0_I]], i32 [[RES_2_I]], i32 1 ; POSTPROCESS-NEXT: [[VAL_2_I:%.*]] = insertelement <3 x i32> [[VAL_1_I]], i32 [[RES_3_I]], i32 2 ; POSTPROCESS-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[VAL_2_I]], i8 1 -; POSTPROCESS-NEXT: [[TMP20:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP18]]) -; POSTPROCESS-NEXT: [[TMP11:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP20]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) +; POSTPROCESS-NEXT: [[TMP20:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP18]]) +; POSTPROCESS-NEXT: [[TMP11:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP20]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) ; POSTPROCESS-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 0 ; POSTPROCESS-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 1 ; POSTPROCESS-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 2 @@ -1921,7 +1923,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTSROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_4_VEC_INSERT]], float [[TMP4]], i32 2 ; POSTPROCESS-NEXT: [[TMP5:%.*]] = bitcast i32 [[PAYLOAD_FCA_9_EXTRACT]] to float ; POSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_INSERT:%.*]] 
= insertelement <4 x float> [[DOTSROA_0_8_VEC_INSERT]], float [[TMP5]], i32 3 -; POSTPROCESS-NEXT: [[TMP6:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; POSTPROCESS-NEXT: [[TMP6:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP6]], 0 ; POSTPROCESS-NEXT: [[DOTSROA_012_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 ; POSTPROCESS-NEXT: [[TMP7:%.*]] = bitcast float [[DOTSROA_012_0_VEC_EXTRACT]] to i32 @@ -2032,7 +2034,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP6:%.*]] = bitcast i32 [[PAYLOAD_FCA_9_EXTRACT]] to float ; POSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_8_VEC_INSERT]], float [[TMP6]], i32 3 ; POSTPROCESS-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; POSTPROCESS-NEXT: [[TMP8:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP7]]) +; POSTPROCESS-NEXT: [[TMP8:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP7]]) ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT401:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP8]], 0 ; POSTPROCESS-NEXT: [[DOTSROA_0403_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT401]], i32 0 ; POSTPROCESS-NEXT: [[TMP9:%.*]] = bitcast float [[DOTSROA_0403_0_VEC_EXTRACT]] to i32 @@ -3423,8 +3425,8 @@ attributes #5 = { nocallback nofree 
nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; CLEANUP-CPS-NEXT: [[TMP1:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; CLEANUP-CPS-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; CLEANUP-CPS-NEXT: [[TMP3:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP1]]) -; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP3]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; CLEANUP-CPS-NEXT: [[TMP3:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP1]]) +; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP3]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP4]]) ; CLEANUP-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT20]], 0 ; CLEANUP-CPS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 @@ -3501,8 +3503,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[VAL_1_I:%.*]] = insertelement <3 x i32> [[VAL_0_I]], i32 [[RES_2_I]], i32 1 ; CLEANUP-CPS-NEXT: [[VAL_2_I:%.*]] = insertelement <3 x i32> [[VAL_1_I]], i32 [[RES_3_I]], i32 2 ; CLEANUP-CPS-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x 
i32> [[VAL_2_I]], i8 1 -; CLEANUP-CPS-NEXT: [[TMP14:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP11]]) -; CLEANUP-CPS-NEXT: [[TMP15:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP14]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) +; CLEANUP-CPS-NEXT: [[TMP14:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP11]]) +; CLEANUP-CPS-NEXT: [[TMP15:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP14]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) ; CLEANUP-CPS-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 0 ; CLEANUP-CPS-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 1 ; CLEANUP-CPS-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 2 @@ -3536,7 +3538,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTSROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_4_VEC_INSERT]], float [[TMP2]], i32 2 ; CLEANUP-CPS-NEXT: [[TMP3:%.*]] = bitcast i32 [[PAYLOAD_FCA_9_EXTRACT]] to float ; CLEANUP-CPS-NEXT: [[DOTSROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_8_VEC_INSERT]], float [[TMP3]], i32 3 -; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; CLEANUP-CPS-NEXT: 
[[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP4]], 0 ; CLEANUP-CPS-NEXT: [[DOTSROA_012_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 ; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = bitcast float [[DOTSROA_012_0_VEC_EXTRACT]] to i32 @@ -3643,7 +3645,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[TMP3:%.*]] = bitcast i32 [[PAYLOAD_FCA_9_EXTRACT]] to float ; CLEANUP-CPS-NEXT: [[DOTSROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_8_VEC_INSERT]], float [[TMP3]], i32 3 ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP4]]) +; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP4]]) ; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT388:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP5]], 0 ; CLEANUP-CPS-NEXT: [[DOTSROA_0390_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT388]], i32 0 ; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = bitcast float [[DOTSROA_0390_0_VEC_EXTRACT]] to i32 @@ -4982,8 +4984,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; POSTPROCESS-CPS-NEXT: [[TMP3:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = call [[DX_TYPES_HANDLE]] 
[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) -; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP5]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) +; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP5]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP6]]) ; POSTPROCESS-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT20]], 0 ; POSTPROCESS-CPS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 @@ -5063,8 +5065,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[VAL_1_I:%.*]] = insertelement <3 x i32> [[VAL_0_I]], i32 [[RES_2_I]], i32 1 ; POSTPROCESS-CPS-NEXT: [[VAL_2_I:%.*]] = insertelement <3 x i32> [[VAL_1_I]], i32 [[RES_3_I]], i32 2 ; POSTPROCESS-CPS-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[VAL_2_I]], i8 1 -; POSTPROCESS-CPS-NEXT: [[TMP14:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP11]]) -; POSTPROCESS-CPS-NEXT: [[TMP15:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, 
[[DX_TYPES_HANDLE]] [[TMP14]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) +; POSTPROCESS-CPS-NEXT: [[TMP14:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP11]]) +; POSTPROCESS-CPS-NEXT: [[TMP15:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP14]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) ; POSTPROCESS-CPS-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 0 ; POSTPROCESS-CPS-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 1 ; POSTPROCESS-CPS-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 2 @@ -5100,7 +5102,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_4_VEC_INSERT]], float [[TMP2]], i32 2 ; POSTPROCESS-CPS-NEXT: [[TMP3:%.*]] = bitcast i32 [[PAYLOAD_FCA_9_EXTRACT]] to float ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_8_VEC_INSERT]], float [[TMP3]], i32 3 -; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP4]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_012_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 ; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = bitcast float [[DOTSROA_012_0_VEC_EXTRACT]] to i32 @@ -5211,7 +5213,7 @@ attributes 
#5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP3:%.*]] = bitcast i32 [[PAYLOAD_FCA_9_EXTRACT]] to float ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_8_VEC_INSERT]], float [[TMP3]], i32 3 ; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP4]]) +; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP4]]) ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT387:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP5]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0389_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT387]], i32 0 ; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = bitcast float [[DOTSROA_0389_0_VEC_EXTRACT]] to i32 diff --git a/llvmraytracing/test/dx/paq-hit-attribute-size.ll b/llvmraytracing/test/dx/paq-hit-attribute-size.ll index 745733c833..75389981e7 100644 --- a/llvmraytracing/test/dx/paq-hit-attribute-size.ll +++ b/llvmraytracing/test/dx/paq-hit-attribute-size.ll @@ -75,7 +75,7 @@ define void @AnyHit1DWords(%struct.MyPayload* %payload, %struct.Attributes1DWord ; CHECK-MAX-1-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 ; CHECK-MAX-1-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 ; CHECK-MAX-1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-1-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) +; CHECK-MAX-1-NEXT: [[TMP19:%.*]] = call 
[[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) ; CHECK-MAX-1-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 ; CHECK-MAX-1-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP4]], align 4 ; CHECK-MAX-1-NEXT: store i32 [[TMP20]], ptr [[ORIGHITATTRS]], align 4 @@ -104,7 +104,7 @@ define void @AnyHit1DWords(%struct.MyPayload* %payload, %struct.Attributes1DWord ; CHECK-MAX-1-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP34]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP33]]) ; CHECK-MAX-1-NEXT: [[TMP35:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-MAX-1-NEXT: [[TMP36:%.*]] = load [4 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-1-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITSYSTEMDATA]] [[TMP35]], [2 x i32] poison, [4 x i32] [[TMP36]]), !continuation.registercount [[META15]] +; CHECK-MAX-1-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP35]], [2 x i32] poison, [4 x i32] [[TMP36]]), !continuation.registercount [[META15]] ; CHECK-MAX-1-NEXT: unreachable ; ; CHECK-MAX-2-LABEL: define %struct.AnyHitSystemData @AnyHit1DWords( @@ -135,7 +135,7 @@ define void @AnyHit1DWords(%struct.MyPayload* %payload, %struct.Attributes1DWord ; CHECK-MAX-2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 ; CHECK-MAX-2-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 ; CHECK-MAX-2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-2-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) +; CHECK-MAX-2-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) ; CHECK-MAX-2-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 ; CHECK-MAX-2-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP4]], align 4 ; CHECK-MAX-2-NEXT: store i32 [[TMP20]], ptr [[ORIGHITATTRS]], align 4 @@ -164,7 +164,7 @@ define void @AnyHit1DWords(%struct.MyPayload* %payload, %struct.Attributes1DWord ; CHECK-MAX-2-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP34]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP33]]) ; CHECK-MAX-2-NEXT: [[TMP35:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-MAX-2-NEXT: [[TMP36:%.*]] = load [4 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-2-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITSYSTEMDATA]] [[TMP35]], [2 x i32] poison, [4 x i32] [[TMP36]]), !continuation.registercount [[META18]] +; CHECK-MAX-2-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP35]], [2 x i32] poison, [4 x i32] [[TMP36]]), !continuation.registercount [[META18]] ; CHECK-MAX-2-NEXT: unreachable ; ; CHECK-MAX-4-LABEL: define %struct.AnyHitSystemData @AnyHit1DWords( @@ -195,7 +195,7 @@ define void @AnyHit1DWords(%struct.MyPayload* %payload, %struct.Attributes1DWord ; CHECK-MAX-4-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 ; CHECK-MAX-4-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 ; CHECK-MAX-4-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-4-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) +; CHECK-MAX-4-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) ; CHECK-MAX-4-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 ; CHECK-MAX-4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD_ATTR_MAX_4_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 ; CHECK-MAX-4-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP4]], align 4 @@ -225,7 +225,7 @@ define void @AnyHit1DWords(%struct.MyPayload* %payload, %struct.Attributes1DWord ; CHECK-MAX-4-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP35]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP34]]) ; CHECK-MAX-4-NEXT: [[TMP36:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-MAX-4-NEXT: [[TMP37:%.*]] = load [6 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITSYSTEMDATA]] [[TMP36]], [4 x i32] poison, [6 x i32] [[TMP37]]), !continuation.registercount [[META19]] +; CHECK-MAX-4-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP36]], [4 x i32] poison, [6 x i32] [[TMP37]]), !continuation.registercount [[META19]] ; CHECK-MAX-4-NEXT: unreachable ; ; CHECK-MAX-8-LABEL: define %struct.AnyHitSystemData @AnyHit1DWords( @@ -256,7 +256,7 @@ define void @AnyHit1DWords(%struct.MyPayload* %payload, %struct.Attributes1DWord ; CHECK-MAX-8-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 ; CHECK-MAX-8-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 ; CHECK-MAX-8-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-8-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) +; CHECK-MAX-8-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) ; CHECK-MAX-8-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 ; CHECK-MAX-8-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 ; CHECK-MAX-8-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP4]], align 4 @@ -286,7 +286,7 @@ define void @AnyHit1DWords(%struct.MyPayload* %payload, %struct.Attributes1DWord ; CHECK-MAX-8-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP35]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP34]]) ; CHECK-MAX-8-NEXT: [[TMP36:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-MAX-8-NEXT: [[TMP37:%.*]] = load [10 x i32], ptr 
[[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITSYSTEMDATA]] [[TMP36]], [8 x i32] poison, [10 x i32] [[TMP37]]), !continuation.registercount [[META20]] +; CHECK-MAX-8-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP36]], [8 x i32] poison, [10 x i32] [[TMP37]]), !continuation.registercount [[META20]] ; CHECK-MAX-8-NEXT: unreachable ; ret void @@ -325,7 +325,7 @@ define void @AnyHit2DWords(%struct.MyPayload* %payload, %struct.Attributes2DWord ; CHECK-MAX-2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 ; CHECK-MAX-2-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 ; CHECK-MAX-2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-2-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) +; CHECK-MAX-2-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP18]]) ; CHECK-MAX-2-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 ; CHECK-MAX-2-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP4]], align 4 ; CHECK-MAX-2-NEXT: store i32 [[TMP20]], ptr [[ORIGHITATTRS]], align 4 @@ -362,7 +362,7 @@ define void @AnyHit2DWords(%struct.MyPayload* %payload, %struct.Attributes2DWord ; CHECK-MAX-2-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP40]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP39]]) ; CHECK-MAX-2-NEXT: [[TMP41:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-MAX-2-NEXT: [[TMP42:%.*]] = load [4 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-2-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITSYSTEMDATA]] [[TMP41]], [2 x i32] poison, [4 x i32] [[TMP42]]), !continuation.registercount [[META18]] +; CHECK-MAX-2-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP41]], [2 x i32] poison, [4 x i32] [[TMP42]]), !continuation.registercount [[META18]] ; CHECK-MAX-2-NEXT: unreachable ; ; CHECK-MAX-4-LABEL: define %struct.AnyHitSystemData @AnyHit2DWords( @@ -393,7 +393,7 @@ define void @AnyHit2DWords(%struct.MyPayload* %payload, %struct.Attributes2DWord ; CHECK-MAX-4-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 ; CHECK-MAX-4-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 ; CHECK-MAX-4-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-4-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) +; CHECK-MAX-4-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP18]]) ; CHECK-MAX-4-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 ; CHECK-MAX-4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD_ATTR_MAX_4_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 ; CHECK-MAX-4-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP4]], align 4 @@ -431,7 +431,7 @@ define void @AnyHit2DWords(%struct.MyPayload* %payload, %struct.Attributes2DWord ; CHECK-MAX-4-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP41]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP40]]) ; CHECK-MAX-4-NEXT: [[TMP42:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-MAX-4-NEXT: [[TMP43:%.*]] = load [6 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: 
call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITSYSTEMDATA]] [[TMP42]], [4 x i32] poison, [6 x i32] [[TMP43]]), !continuation.registercount [[META19]] +; CHECK-MAX-4-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP42]], [4 x i32] poison, [6 x i32] [[TMP43]]), !continuation.registercount [[META19]] ; CHECK-MAX-4-NEXT: unreachable ; ; CHECK-MAX-8-LABEL: define %struct.AnyHitSystemData @AnyHit2DWords( @@ -462,7 +462,7 @@ define void @AnyHit2DWords(%struct.MyPayload* %payload, %struct.Attributes2DWord ; CHECK-MAX-8-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 ; CHECK-MAX-8-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 ; CHECK-MAX-8-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-8-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) +; CHECK-MAX-8-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP18]]) ; CHECK-MAX-8-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 ; CHECK-MAX-8-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 ; CHECK-MAX-8-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP4]], align 4 @@ -500,7 +500,7 @@ define void @AnyHit2DWords(%struct.MyPayload* %payload, %struct.Attributes2DWord ; CHECK-MAX-8-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP41]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP40]]) ; CHECK-MAX-8-NEXT: [[TMP42:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-MAX-8-NEXT: [[TMP43:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; 
CHECK-MAX-8-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITSYSTEMDATA]] [[TMP42]], [8 x i32] poison, [10 x i32] [[TMP43]]), !continuation.registercount [[META20]] +; CHECK-MAX-8-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP42]], [8 x i32] poison, [10 x i32] [[TMP43]]), !continuation.registercount [[META20]] ; CHECK-MAX-8-NEXT: unreachable ; ret void @@ -543,7 +543,7 @@ define void @AnyHit4DWords(%struct.MyPayload* %payload, %struct.Attributes4DWord ; CHECK-MAX-4-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 ; CHECK-MAX-4-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 ; CHECK-MAX-4-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-4-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) +; CHECK-MAX-4-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP18]]) ; CHECK-MAX-4-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 ; CHECK-MAX-4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD_ATTR_MAX_4_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 ; CHECK-MAX-4-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP4]], align 4 @@ -596,7 +596,7 @@ define void @AnyHit4DWords(%struct.MyPayload* %payload, %struct.Attributes4DWord ; CHECK-MAX-4-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP52]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP51]]) ; CHECK-MAX-4-NEXT: [[TMP53:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-MAX-4-NEXT: [[TMP54:%.*]] = load [6 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-4-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITSYSTEMDATA]] [[TMP53]], [4 x i32] poison, [6 x i32] [[TMP54]]), !continuation.registercount [[META19]] +; CHECK-MAX-4-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP53]], [4 x i32] poison, [6 x i32] [[TMP54]]), !continuation.registercount [[META19]] ; CHECK-MAX-4-NEXT: unreachable ; ; CHECK-MAX-8-LABEL: define %struct.AnyHitSystemData @AnyHit4DWords( @@ -627,7 +627,7 @@ define void @AnyHit4DWords(%struct.MyPayload* %payload, %struct.Attributes4DWord ; CHECK-MAX-8-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 ; CHECK-MAX-8-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 ; CHECK-MAX-8-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-8-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) +; CHECK-MAX-8-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP18]]) ; CHECK-MAX-8-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 ; CHECK-MAX-8-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 ; CHECK-MAX-8-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP4]], align 4 @@ -680,7 +680,7 @@ define void @AnyHit4DWords(%struct.MyPayload* %payload, %struct.Attributes4DWord ; CHECK-MAX-8-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP52]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP51]]) ; CHECK-MAX-8-NEXT: [[TMP53:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-MAX-8-NEXT: [[TMP54:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; 
CHECK-MAX-8-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITSYSTEMDATA]] [[TMP53]], [8 x i32] poison, [10 x i32] [[TMP54]]), !continuation.registercount [[META20]] +; CHECK-MAX-8-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP53]], [8 x i32] poison, [10 x i32] [[TMP54]]), !continuation.registercount [[META20]] ; CHECK-MAX-8-NEXT: unreachable ; ret void @@ -727,7 +727,7 @@ define void @AnyHit8DWords(%struct.MyPayload* %payload, %struct.Attributes8DWord ; CHECK-MAX-8-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 ; CHECK-MAX-8-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 ; CHECK-MAX-8-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; CHECK-MAX-8-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) +; CHECK-MAX-8-NEXT: [[TMP19:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES]](ptr [[TMP18]]) ; CHECK-MAX-8-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP19]], ptr [[TMP4]], align 4 ; CHECK-MAX-8-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 0, i32 1 ; CHECK-MAX-8-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP4]], align 4 @@ -812,7 +812,7 @@ define void @AnyHit8DWords(%struct.MyPayload* %payload, %struct.Attributes8DWord ; CHECK-MAX-8-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP76]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP75]]) ; CHECK-MAX-8-NEXT: [[TMP77:%.*]] = load [[STRUCT_ANYHITSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-MAX-8-NEXT: [[TMP78:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-MAX-8-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITSYSTEMDATA]] [[TMP77]], [8 x i32] poison, [10 x i32] [[TMP78]]), !continuation.registercount [[META20]] +; CHECK-MAX-8-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITSYSTEMDATA]] [[TMP77]], [8 x i32] poison, [10 x i32] [[TMP78]]), !continuation.registercount [[META20]] ; CHECK-MAX-8-NEXT: unreachable ; ret void diff --git a/llvmraytracing/test/dx/payload-caller-in-paq.ll b/llvmraytracing/test/dx/payload-caller-in-paq.ll index 303a0dd4ce..4bf8d2575f 100644 --- a/llvmraytracing/test/dx/payload-caller-in-paq.ll +++ b/llvmraytracing/test/dx/payload-caller-in-paq.ll @@ -46,8 +46,8 @@ define void @RayGen() #0 { ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP5]]) #[[ATTR0]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store float 1.000000e+00, ptr [[TMP6]], align 8, !tbaa [[TBAA28:![0-9]+]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], 0 @@ -58,7 +58,7 @@ define void @RayGen() #0 { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP10]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP33]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load [1 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 -1, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [1 x i32] [[TMP12]]), !continuation.registercount [[META32:![0-9]+]], !continuation.wait.await [[META13]], !continuation.returnedRegistercount [[META25:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [1 x i32] [[TMP12]]), !continuation.registercount [[META32:![0-9]+]], !waitmask [[META33:![0-9]+]], !continuation.returnedRegistercount [[META25:![0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [12 x i32], [3 x i32] } @await(ptr [[TMP17]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [12 x i32], [3 x i32] } [[TMP20]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [3 x i32] [[TMP15]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 @@ -85,20 +85,21 @@ define void @RayGen() #0 { ; LOWERRAYTRACINGPIPELINE: .split: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = load float, ptr [[TMP6]], align 8, !tbaa [[TBAA28]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], 
ptr [[TMP4]], i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4, !tbaa [[TBAA33:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4, !tbaa [[TBAA34:![0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = sitofp i32 [[TMP44]] to float ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP4]], i32 0, i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = load double, ptr [[TMP46]], align 8, !tbaa [[TBAA35:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = load double, ptr [[TMP46]], align 8, !tbaa [[TBAA36:![0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = fptrunc double [[TMP47]] to float ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() ; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP49]], i8 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() ; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP50]], i8 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP37]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP37]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) ; LOWERRAYTRACINGPIPELINE-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP52]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP42]], float 
[[TMP45]], float [[TMP48]], float 0.000000e+00, i8 15) ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP5]]) #[[ATTR0]] -; LOWERRAYTRACINGPIPELINE-NEXT: ret void +; LOWERRAYTRACINGPIPELINE-NEXT: call void @lgc.cps.complete() +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A", align 4 %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?gOutput@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 diff --git a/llvmraytracing/test/dx/payload-save-registers.ll b/llvmraytracing/test/dx/payload-save-registers.ll index e1429dc68b..21a0ee7046 100644 --- a/llvmraytracing/test/dx/payload-save-registers.ll +++ b/llvmraytracing/test/dx/payload-save-registers.ll @@ -57,28 +57,28 @@ define void @Miss(%struct.OuterPayload* noalias nocapture %outerPayload) #0 !poi ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP44]], ptr [[TMP12]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = load i32, ptr [[TMP13]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP67]], ptr [[TMP14]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP51]], ptr [[TMP14]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 5 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = load i32, ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP68]], ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = load i32, ptr [[TMP15]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP67]], ptr [[TMP16]], align 4 ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 6 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP69]], ptr [[TMP18]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = load i32, ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP68]], ptr [[TMP18]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 7 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = load i32, ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP70]], ptr [[TMP20]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = load i32, ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP69]], ptr [[TMP20]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP21]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP31]], ptr [[TMP22]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 9 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 9 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = load i32, ptr [[TMP23]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP71]], ptr [[TMP24]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = load i32, ptr [[TMP23]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP70]], ptr [[TMP24]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 10 ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 10 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP25]], align 4 @@ -122,11 +122,11 @@ define void @Miss(%struct.OuterPayload* noalias nocapture %outerPayload) #0 !poi ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = bitcast ptr [[TMP46]] to ptr ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[TMP47]]) #[[ATTR0]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0, i32 14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = load float, ptr [[TMP48]], align 4, !tbaa [[TBAA27:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = load float, ptr [[TMP48]], align 4, !tbaa [[TBAA27:![0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT_INNERPAYLOAD]], ptr [[TMP46]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP51]], ptr [[TMP50]], align 4, !tbaa [[TBAA27]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP73:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP45]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP73]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP71]], ptr [[TMP50]], align 4, !tbaa [[TBAA27]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP73:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP45]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] 
[[TMP73]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP52]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP54]], align 4 @@ -233,7 +233,7 @@ define void @Miss(%struct.OuterPayload* noalias nocapture %outerPayload) #0 !poi ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP94:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP95:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP94]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP143:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP95]], [10 x i32] poison, [30 x i32] [[TMP143]]), !continuation.registercount [[META23]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP95]], [10 x i32] poison, [30 x i32] [[TMP143]]), !continuation.registercount [[META23]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A", align 4 @@ -934,7 +934,7 @@ define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !pointeety ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP484]], ptr [[TMP482]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP382:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP486:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP382]], [10 x i32] poison, [30 x i32] [[TMP486]]), !continuation.registercount [[META23]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP382]], [10 x i32] poison, [30 x i32] [[TMP486]]), !continuation.registercount [[META23]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; %1 = alloca %struct.OuterPayload, align 8 diff --git a/llvmraytracing/test/dx/payload.ll b/llvmraytracing/test/dx/payload.ll index c64fe6ec79..cc424b6621 100644 --- a/llvmraytracing/test/dx/payload.ll +++ b/llvmraytracing/test/dx/payload.ll @@ -1,9 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,continuations-lint,remove-types-metadata' -S --lint-abort-on-error | FileCheck -check-prefix=CLEANUP %s -; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' \ +; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | grep -v lgc.cps.module | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,continuations-lint,remove-types-metadata' -S --lint-abort-on-error | FileCheck -check-prefix=CLEANUP %s +; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | grep -v lgc.cps.module | opt --verify-each 
-passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' \ ; RUN: -S --lint-abort-on-error | FileCheck -check-prefix=POST-PROCESS %s -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' \ -; RUN: -S %s --lint-abort-on-error | FileCheck -check-prefix=POST-PROCESS-GLOBAL %s +; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' \ +; RUN: -S --lint-abort-on-error | FileCheck -check-prefix=POST-PROCESS-GLOBAL %s +; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,continuations-lint,remove-types-metadata' -S --lint-abort-on-error | FileCheck -check-prefix=CLEANUP-CPS %s +; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' \ +; RUN: -S --lint-abort-on-error | FileCheck -check-prefix=POST-PROCESS-CPS %s +; RUN: opt --verify-each 
-passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' \ +; RUN: -S %s --lint-abort-on-error | FileCheck -check-prefix=POST-PROCESS-GLOBAL-CPS %s target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" @@ -148,6 +153,7 @@ attributes #3 = { nounwind } !dx.shaderModel = !{!2} !dx.entryPoints = !{!3, !6, !13, !15} !continuation.stackAddrspace = !{!36} ; SKIP_GLOBAL_ADDRSPACE +!lgc.cps.module = !{} !0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} !1 = !{i32 1, i32 6} @@ -207,8 +213,8 @@ attributes #3 = { nounwind } ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; CLEANUP-NEXT: [[TMP1:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; CLEANUP-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; CLEANUP-NEXT: [[TMP3:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP1]]) -; CLEANUP-NEXT: [[TMP4:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP3]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; CLEANUP-NEXT: [[TMP3:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP1]]) +; CLEANUP-NEXT: [[TMP4:%.*]] = call [[DX_TYPES_HANDLE]] 
@[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP3]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; CLEANUP-NEXT: [[TMP5:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP4]]) ; CLEANUP-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT56]], 0 ; CLEANUP-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 @@ -301,7 +307,7 @@ attributes #3 = { nounwind } ; CLEANUP-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 undef, 28 ; CLEANUP-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 undef, 29 ; CLEANUP-NEXT: [[TMP34:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @main.resume.0) -; CLEANUP-NEXT: call void (...) @lgc.cps.jump(i64 4, i32 -1, {} poison, i64 [[TMP34]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META17:![0-9]+]], !continuation.returnedRegistercount [[META17]], !waitmask [[META22:![0-9]+]] +; CLEANUP-NEXT: call void (...) 
@lgc.cps.jump(i64 4, i32 -1, {} poison, i64 [[TMP34]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META17:![0-9]+]], !waitmask [[META22:![0-9]+]], !continuation.returnedRegistercount [[META17]] ; CLEANUP-NEXT: unreachable ; ; @@ -662,7 +668,7 @@ attributes #3 = { nounwind } ; CLEANUP-NEXT: [[DOTFCA_27_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT]], i32 [[PAYLOAD_FCA_27_EXTRACT]], 27 ; CLEANUP-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 -; CLEANUP-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR]], i32 poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META17]] +; CLEANUP-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META17]] ; CLEANUP-NEXT: unreachable ; ; @@ -770,8 +776,8 @@ attributes #3 = { nounwind } ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CLEANUP-NEXT: [[TMP59:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; CLEANUP-NEXT: [[TMP65:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; CLEANUP-NEXT: [[TMP67:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP59]]) -; CLEANUP-NEXT: [[TMP69:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP67]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; CLEANUP-NEXT: [[TMP67:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP59]]) +; CLEANUP-NEXT: [[TMP69:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP67]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; CLEANUP-NEXT: [[TMP61:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP69]]) ; CLEANUP-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0 ; CLEANUP-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 @@ -864,7 +870,7 @@ attributes #3 = { nounwind } ; CLEANUP-NEXT: [[DOTFCA_28_INSERT138:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT135]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-NEXT: [[DOTFCA_29_INSERT141:%.*]] = insertvalue [30 x i32] 
[[DOTFCA_28_INSERT138]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-NEXT: [[TMP116:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @ClosestHit.resume.0) -; CLEANUP-NEXT: call void (...) @lgc.cps.jump(i64 4, i32 -1, {} poison, i64 [[TMP116]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT141]]), !continuation.registercount [[META17]], !continuation.returnedRegistercount [[META17]], !waitmask [[META22]] +; CLEANUP-NEXT: call void (...) @lgc.cps.jump(i64 4, i32 -1, {} poison, i64 [[TMP116]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT141]]), !continuation.registercount [[META17]], !waitmask [[META22]], !continuation.returnedRegistercount [[META17]] ; CLEANUP-NEXT: unreachable ; ; @@ -1052,7 +1058,7 @@ attributes #3 = { nounwind } ; CLEANUP-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[TMP23]], 28 ; CLEANUP-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29 ; CLEANUP-NEXT: call void @lgc.cps.free(i32 120) -; CLEANUP-NEXT: call void (...) @lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [23 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META17]] +; CLEANUP-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR_RELOAD]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [23 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META17]] ; CLEANUP-NEXT: unreachable ; ; @@ -1080,8 +1086,8 @@ attributes #3 = { nounwind } ; POST-PROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; POST-PROCESS-NEXT: [[TMP3:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; POST-PROCESS-NEXT: [[TMP4:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; POST-PROCESS-NEXT: [[TMP5:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) -; POST-PROCESS-NEXT: [[TMP6:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP5]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; POST-PROCESS-NEXT: [[TMP5:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) +; POST-PROCESS-NEXT: [[TMP6:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP5]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; POST-PROCESS-NEXT: [[TMP7:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP6]]) ; POST-PROCESS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT56]], 0 ; POST-PROCESS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 @@ -1920,8 +1926,8 @@ attributes #3 = { nounwind } ; POST-PROCESS-NEXT: call void 
@amd.dx.setLocalRootIndex(i32 5) ; POST-PROCESS-NEXT: [[TMP118:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; POST-PROCESS-NEXT: [[TMP119:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; POST-PROCESS-NEXT: [[TMP120:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP118]]) -; POST-PROCESS-NEXT: [[TMP121:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP120]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; POST-PROCESS-NEXT: [[TMP120:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP118]]) +; POST-PROCESS-NEXT: [[TMP121:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP120]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; POST-PROCESS-NEXT: [[TMP122:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP121]]) ; POST-PROCESS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0 ; POST-PROCESS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 @@ -2398,8 +2404,8 @@ attributes #3 = { nounwind } ; POST-PROCESS-GLOBAL-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; POST-PROCESS-GLOBAL-NEXT: [[TMP5:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP6:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] 
[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP5]]) -; POST-PROCESS-GLOBAL-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; POST-PROCESS-GLOBAL-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP5]]) +; POST-PROCESS-GLOBAL-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; POST-PROCESS-GLOBAL-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) ; POST-PROCESS-GLOBAL-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT56]], 0 ; POST-PROCESS-GLOBAL-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 @@ -3107,8 +3113,8 @@ attributes #3 = { nounwind } ; POST-PROCESS-GLOBAL-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; POST-PROCESS-GLOBAL-NEXT: [[TMP91:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP92:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP93:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP91]]) -; POST-PROCESS-GLOBAL-NEXT: [[TMP94:%.*]] = call [[DX_TYPES_HANDLE]] 
[[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP93]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; POST-PROCESS-GLOBAL-NEXT: [[TMP93:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP91]]) +; POST-PROCESS-GLOBAL-NEXT: [[TMP94:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP93]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; POST-PROCESS-GLOBAL-NEXT: [[TMP95:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP94]]) ; POST-PROCESS-GLOBAL-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0 ; POST-PROCESS-GLOBAL-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 @@ -3477,3 +3483,3288 @@ attributes #3 = { nounwind } ; POST-PROCESS-GLOBAL-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP170]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]], [23 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) ; POST-PROCESS-GLOBAL-NEXT: unreachable ; +; +; CLEANUP-CPS-LABEL: define %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes( +; CLEANUP-CPS-SAME: ptr [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { +; CLEANUP-CPS-NEXT: [[ADDR:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[DATA]], i32 0, i32 1 +; CLEANUP-CPS-NEXT: [[VAL:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], ptr [[ADDR]], align 4 +; CLEANUP-CPS-NEXT: ret [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL]] +; +; +; CLEANUP-CPS-LABEL: define i32 @_cont_GetLocalRootIndex( +; CLEANUP-CPS-SAME: ptr [[DATA:%.*]]) #[[ATTR0]] { +; CLEANUP-CPS-NEXT: ret i32 5 +; +; +; CLEANUP-CPS-LABEL: define void @main( +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !lgc.cps [[META19:![0-9]+]] !continuation [[META20:![0-9]+]] !continuation.stacksize [[META21:![0-9]+]] { +; CLEANUP-CPS-NEXT: AllocaSpillBB: +; CLEANUP-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 108) +; CLEANUP-CPS-NEXT: [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MAIN_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT56:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) +; CLEANUP-CPS-NEXT: [[TMP1:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 +; CLEANUP-CPS-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 +; CLEANUP-CPS-NEXT: [[TMP3:%.*]] = call [[DX_TYPES_HANDLE]] 
@[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP1]]) +; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP3]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP4]]) +; CLEANUP-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT56]], 0 +; CLEANUP-CPS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 +; CLEANUP-CPS-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 +; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @main.resume.0) +; CLEANUP-CPS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP6]], 5 +; CLEANUP-CPS-NEXT: [[TMP7:%.*]] = ptrtoint ptr addrspace(32) [[PAYLOAD_SPILL_ALLOCA]] to i32 +; CLEANUP-CPS-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP7]] to ptr addrspace(32) +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP8]], align 4 +; CLEANUP-CPS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 1 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP9]], align 4 +; CLEANUP-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 2 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP10]], align 4 +; CLEANUP-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 3 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP11]], align 4 +; CLEANUP-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 4 +; 
CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP12]], align 4 +; CLEANUP-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 5 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP13]], align 4 +; CLEANUP-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 6 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP14]], align 4 +; CLEANUP-CPS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 7 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP15]], align 4 +; CLEANUP-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 8 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP16]], align 4 +; CLEANUP-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 9 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP17]], align 4 +; CLEANUP-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 10 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP18]], align 4 +; CLEANUP-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 11 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP19]], align 4 +; CLEANUP-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 12 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP20]], align 4 +; CLEANUP-CPS-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 13 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP21]], align 4 +; CLEANUP-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 14 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP22]], align 4 +; CLEANUP-CPS-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 15 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP23]], align 4 +; 
CLEANUP-CPS-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 16 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP24]], align 4 +; CLEANUP-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 17 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP25]], align 4 +; CLEANUP-CPS-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 18 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP26]], align 4 +; CLEANUP-CPS-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 19 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP27]], align 4 +; CLEANUP-CPS-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 20 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP28]], align 4 +; CLEANUP-CPS-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 21 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP29]], align 4 +; CLEANUP-CPS-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 22 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP30]], align 4 +; CLEANUP-CPS-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 23 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP31]], align 4 +; CLEANUP-CPS-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 24 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP32]], align 4 +; CLEANUP-CPS-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 25 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP33]], align 4 +; CLEANUP-CPS-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP8]], i32 26 +; CLEANUP-CPS-NEXT: store i32 undef, ptr addrspace(32) [[TMP34]], align 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [30 x i32] poison, 
i32 [[TMP7]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_0_INSERT]], i32 undef, 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_1_INSERT]], i32 undef, 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_2_INSERT]], i32 undef, 3 +; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_3_INSERT]], i32 undef, 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_4_INSERT]], i32 undef, 5 +; CLEANUP-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_5_INSERT]], i32 undef, 6 +; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_6_INSERT]], i32 undef, 7 +; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_7_INSERT]], i32 undef, 8 +; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_8_INSERT]], i32 undef, 9 +; CLEANUP-CPS-NEXT: [[DOTFCA_10_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_9_INSERT]], i32 undef, 10 +; CLEANUP-CPS-NEXT: [[DOTFCA_11_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_10_INSERT]], i32 undef, 11 +; CLEANUP-CPS-NEXT: [[DOTFCA_12_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_11_INSERT]], i32 undef, 12 +; CLEANUP-CPS-NEXT: [[DOTFCA_13_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_12_INSERT]], i32 undef, 13 +; CLEANUP-CPS-NEXT: [[DOTFCA_14_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_13_INSERT]], i32 undef, 14 +; CLEANUP-CPS-NEXT: [[DOTFCA_15_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_14_INSERT]], i32 undef, 15 +; CLEANUP-CPS-NEXT: [[DOTFCA_16_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_15_INSERT]], i32 undef, 16 +; CLEANUP-CPS-NEXT: [[DOTFCA_17_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_16_INSERT]], i32 undef, 17 +; CLEANUP-CPS-NEXT: [[DOTFCA_18_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_17_INSERT]], i32 undef, 18 +; CLEANUP-CPS-NEXT: [[DOTFCA_19_INSERT:%.*]] = insertvalue [30 x i32] 
[[DOTFCA_18_INSERT]], i32 undef, 19 +; CLEANUP-CPS-NEXT: [[DOTFCA_20_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_19_INSERT]], i32 undef, 20 +; CLEANUP-CPS-NEXT: [[DOTFCA_21_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_20_INSERT]], i32 undef, 21 +; CLEANUP-CPS-NEXT: [[DOTFCA_22_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_21_INSERT]], i32 undef, 22 +; CLEANUP-CPS-NEXT: [[DOTFCA_23_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_22_INSERT]], i32 undef, 23 +; CLEANUP-CPS-NEXT: [[DOTFCA_24_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_23_INSERT]], i32 undef, 24 +; CLEANUP-CPS-NEXT: [[DOTFCA_25_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_24_INSERT]], i32 undef, 25 +; CLEANUP-CPS-NEXT: [[DOTFCA_26_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_25_INSERT]], i32 undef, 26 +; CLEANUP-CPS-NEXT: [[DOTFCA_27_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT]], i32 undef, 27 +; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 undef, 28 +; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 undef, 29 +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 4, i32 5, {} poison, i64 [[TMP6]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !waitmask [[META22:![0-9]+]], !continuation.returnedRegistercount [[META17:![0-9]+]], !continuation.registercount [[META17]] +; CLEANUP-CPS-NEXT: unreachable +; +; +; CLEANUP-CPS-LABEL: define dso_local void @main.resume.0( +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [23 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META8]] !lgc.cps [[META19]] !continuation [[META20]] { +; CLEANUP-CPS-NEXT: entryresume.0: +; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 108) +; CLEANUP-CPS-NEXT: [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MAIN_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 0 +; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP3]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 3 +; CLEANUP-CPS-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_5_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 5 +; CLEANUP-CPS-NEXT: [[DOTFCA_6_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 6 +; CLEANUP-CPS-NEXT: [[DOTFCA_7_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 7 +; CLEANUP-CPS-NEXT: [[DOTFCA_8_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 8 +; CLEANUP-CPS-NEXT: [[DOTFCA_9_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 9 +; CLEANUP-CPS-NEXT: [[DOTFCA_10_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 10 +; CLEANUP-CPS-NEXT: [[DOTFCA_11_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 11 +; 
CLEANUP-CPS-NEXT: [[DOTFCA_12_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 12 +; CLEANUP-CPS-NEXT: [[DOTFCA_13_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 13 +; CLEANUP-CPS-NEXT: [[DOTFCA_14_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 14 +; CLEANUP-CPS-NEXT: [[DOTFCA_15_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 15 +; CLEANUP-CPS-NEXT: [[DOTFCA_16_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 16 +; CLEANUP-CPS-NEXT: [[DOTFCA_17_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 17 +; CLEANUP-CPS-NEXT: [[DOTFCA_18_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 18 +; CLEANUP-CPS-NEXT: [[DOTFCA_19_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 19 +; CLEANUP-CPS-NEXT: [[DOTFCA_20_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 20 +; CLEANUP-CPS-NEXT: [[DOTFCA_21_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 21 +; CLEANUP-CPS-NEXT: [[DOTFCA_22_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 22 +; CLEANUP-CPS-NEXT: [[DOTFCA_23_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 23 +; CLEANUP-CPS-NEXT: [[DOTFCA_24_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 24 +; CLEANUP-CPS-NEXT: [[DOTFCA_25_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 25 +; CLEANUP-CPS-NEXT: [[DOTFCA_26_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 26 +; CLEANUP-CPS-NEXT: [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 27 +; CLEANUP-CPS-NEXT: [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 28 +; CLEANUP-CPS-NEXT: [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 29 +; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT]] to ptr addrspace(32) +; CLEANUP-CPS-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(32) [[TMP6]], align 4 +; CLEANUP-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 1 +; CLEANUP-CPS-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(32) [[TMP8]], align 4 +; CLEANUP-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 2 +; 
CLEANUP-CPS-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(32) [[TMP10]], align 4 +; CLEANUP-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 3 +; CLEANUP-CPS-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(32) [[TMP12]], align 4 +; CLEANUP-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 4 +; CLEANUP-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(32) [[TMP14]], align 4 +; CLEANUP-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 5 +; CLEANUP-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(32) [[TMP16]], align 4 +; CLEANUP-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 6 +; CLEANUP-CPS-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(32) [[TMP18]], align 4 +; CLEANUP-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 7 +; CLEANUP-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(32) [[TMP20]], align 4 +; CLEANUP-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 8 +; CLEANUP-CPS-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(32) [[TMP22]], align 4 +; CLEANUP-CPS-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 9 +; CLEANUP-CPS-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(32) [[TMP24]], align 4 +; CLEANUP-CPS-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 10 +; CLEANUP-CPS-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(32) [[TMP26]], align 4 +; CLEANUP-CPS-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 11 +; CLEANUP-CPS-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(32) [[TMP28]], align 4 +; CLEANUP-CPS-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 12 +; CLEANUP-CPS-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(32) [[TMP30]], align 4 +; CLEANUP-CPS-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr addrspace(32) 
[[TMP6]], i32 13 +; CLEANUP-CPS-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(32) [[TMP32]], align 4 +; CLEANUP-CPS-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 14 +; CLEANUP-CPS-NEXT: [[TMP35:%.*]] = load i32, ptr addrspace(32) [[TMP34]], align 4 +; CLEANUP-CPS-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 15 +; CLEANUP-CPS-NEXT: [[TMP37:%.*]] = load i32, ptr addrspace(32) [[TMP36]], align 4 +; CLEANUP-CPS-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 16 +; CLEANUP-CPS-NEXT: [[TMP39:%.*]] = load i32, ptr addrspace(32) [[TMP38]], align 4 +; CLEANUP-CPS-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 17 +; CLEANUP-CPS-NEXT: [[TMP41:%.*]] = load i32, ptr addrspace(32) [[TMP40]], align 4 +; CLEANUP-CPS-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 18 +; CLEANUP-CPS-NEXT: [[TMP43:%.*]] = load i32, ptr addrspace(32) [[TMP42]], align 4 +; CLEANUP-CPS-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 19 +; CLEANUP-CPS-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(32) [[TMP44]], align 4 +; CLEANUP-CPS-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 20 +; CLEANUP-CPS-NEXT: [[TMP47:%.*]] = load i32, ptr addrspace(32) [[TMP46]], align 4 +; CLEANUP-CPS-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 21 +; CLEANUP-CPS-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(32) [[TMP48]], align 4 +; CLEANUP-CPS-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 22 +; CLEANUP-CPS-NEXT: [[TMP51:%.*]] = load i32, ptr addrspace(32) [[TMP50]], align 4 +; CLEANUP-CPS-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 23 +; CLEANUP-CPS-NEXT: [[TMP53:%.*]] = load i32, ptr addrspace(32) [[TMP52]], align 4 +; CLEANUP-CPS-NEXT: [[TMP54:%.*]] = getelementptr 
inbounds i32, ptr addrspace(32) [[TMP6]], i32 24 +; CLEANUP-CPS-NEXT: [[TMP55:%.*]] = load i32, ptr addrspace(32) [[TMP54]], align 4 +; CLEANUP-CPS-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 25 +; CLEANUP-CPS-NEXT: [[TMP57:%.*]] = load i32, ptr addrspace(32) [[TMP56]], align 4 +; CLEANUP-CPS-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 26 +; CLEANUP-CPS-NEXT: [[TMP59:%.*]] = load i32, ptr addrspace(32) [[TMP58]], align 4 +; CLEANUP-CPS-NEXT: [[TMP60:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT]] to ptr addrspace(32) +; CLEANUP-CPS-NEXT: [[TMP61:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP3]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT57:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP61]], 0 +; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) +; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 108) +; CLEANUP-CPS-NEXT: ret void +; +; +; CLEANUP-CPS-LABEL: define void @AnyHit( +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META23:![0-9]+]] !lgc.cps [[META24:![0-9]+]] !continuation [[META25:![0-9]+]] { +; CLEANUP-CPS-NEXT: AllocaSpillBB: +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 0 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 1 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 2 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 3 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [30 x i32] 
[[PAYLOAD]], 4 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 5 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 6 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 7 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 8 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 9 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_10_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 10 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_11_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 11 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_12_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 12 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_13_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 13 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_14_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 14 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_15_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 15 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_16_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 16 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_17_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 17 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_18_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 18 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_19_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 19 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_20_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 20 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_21_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 21 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_22_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 22 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_23_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 23 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_24_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 24 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_25_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 25 +; CLEANUP-CPS-NEXT: 
[[PAYLOAD_FCA_26_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 26 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 27 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 28 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 29 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 0, 0, 0 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0, i32 0 +; CLEANUP-CPS-NEXT: store <3 x i32> [[SYSTEM_DATA_FCA_0_0_0_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_0_0_0_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 0, 1, 0 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_1_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 1, i32 0 +; CLEANUP-CPS-NEXT: store <2 x float> [[SYSTEM_DATA_FCA_0_0_1_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_0_1_0_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 1, 0 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_1_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 1, i32 0 +; CLEANUP-CPS-NEXT: store float [[SYSTEM_DATA_FCA_0_1_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_1_0_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_1_1_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 1, 1 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_1_1_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 1, i32 1 +; CLEANUP-CPS-NEXT: store i32 [[SYSTEM_DATA_FCA_0_1_1_EXTRACT]], ptr 
[[SYSTEM_DATA_FCA_0_1_1_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 2 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_2_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 2 +; CLEANUP-CPS-NEXT: store <3 x float> [[SYSTEM_DATA_FCA_0_2_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_2_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 3 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_3_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 3 +; CLEANUP-CPS-NEXT: store <3 x float> [[SYSTEM_DATA_FCA_0_3_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_3_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_4_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 4 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_4_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 4 +; CLEANUP-CPS-NEXT: store float [[SYSTEM_DATA_FCA_0_4_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_4_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_5_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 5 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_5_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 5 +; CLEANUP-CPS-NEXT: store i64 [[SYSTEM_DATA_FCA_0_5_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_5_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 1, 0 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_1_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 0 +; CLEANUP-CPS-NEXT: store float [[SYSTEM_DATA_FCA_1_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_1_0_GEP]], align 4 
+; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_1_1_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 1, 1 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_1_1_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 1 +; CLEANUP-CPS-NEXT: store i32 [[SYSTEM_DATA_FCA_1_1_EXTRACT]], ptr [[SYSTEM_DATA_FCA_1_1_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[TMP0:%.*]] = inttoptr i32 [[PAYLOAD_FCA_0_EXTRACT]] to ptr addrspace(32) +; CLEANUP-CPS-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(32) [[TMP0]], align 4 +; CLEANUP-CPS-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 1 +; CLEANUP-CPS-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(32) [[TMP2]], align 4 +; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 2 +; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(32) [[TMP4]], align 4 +; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 3 +; CLEANUP-CPS-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(32) [[TMP6]], align 4 +; CLEANUP-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 4 +; CLEANUP-CPS-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(32) [[TMP8]], align 4 +; CLEANUP-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 5 +; CLEANUP-CPS-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(32) [[TMP10]], align 4 +; CLEANUP-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 6 +; CLEANUP-CPS-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(32) [[TMP12]], align 4 +; CLEANUP-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 7 +; CLEANUP-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(32) [[TMP14]], align 4 +; CLEANUP-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 8 +; CLEANUP-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(32) 
[[TMP16]], align 4 +; CLEANUP-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 9 +; CLEANUP-CPS-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(32) [[TMP18]], align 4 +; CLEANUP-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 10 +; CLEANUP-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(32) [[TMP20]], align 4 +; CLEANUP-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 11 +; CLEANUP-CPS-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(32) [[TMP22]], align 4 +; CLEANUP-CPS-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 12 +; CLEANUP-CPS-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(32) [[TMP24]], align 4 +; CLEANUP-CPS-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 13 +; CLEANUP-CPS-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(32) [[TMP26]], align 4 +; CLEANUP-CPS-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 14 +; CLEANUP-CPS-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(32) [[TMP28]], align 4 +; CLEANUP-CPS-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 15 +; CLEANUP-CPS-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(32) [[TMP30]], align 4 +; CLEANUP-CPS-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 16 +; CLEANUP-CPS-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(32) [[TMP32]], align 4 +; CLEANUP-CPS-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 17 +; CLEANUP-CPS-NEXT: [[TMP35:%.*]] = load i32, ptr addrspace(32) [[TMP34]], align 4 +; CLEANUP-CPS-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 18 +; CLEANUP-CPS-NEXT: [[TMP37:%.*]] = load i32, ptr addrspace(32) [[TMP36]], align 4 +; CLEANUP-CPS-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 19 +; CLEANUP-CPS-NEXT: [[TMP39:%.*]] = load 
i32, ptr addrspace(32) [[TMP38]], align 4 +; CLEANUP-CPS-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 20 +; CLEANUP-CPS-NEXT: [[TMP41:%.*]] = load i32, ptr addrspace(32) [[TMP40]], align 4 +; CLEANUP-CPS-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 21 +; CLEANUP-CPS-NEXT: [[TMP43:%.*]] = load i32, ptr addrspace(32) [[TMP42]], align 4 +; CLEANUP-CPS-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 22 +; CLEANUP-CPS-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(32) [[TMP44]], align 4 +; CLEANUP-CPS-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 23 +; CLEANUP-CPS-NEXT: [[TMP47:%.*]] = load i32, ptr addrspace(32) [[TMP46]], align 4 +; CLEANUP-CPS-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 24 +; CLEANUP-CPS-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(32) [[TMP48]], align 4 +; CLEANUP-CPS-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 25 +; CLEANUP-CPS-NEXT: [[TMP51:%.*]] = load i32, ptr addrspace(32) [[TMP50]], align 4 +; CLEANUP-CPS-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 26 +; CLEANUP-CPS-NEXT: [[TMP53:%.*]] = load i32, ptr addrspace(32) [[TMP52]], align 4 +; CLEANUP-CPS-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; CLEANUP-CPS-NEXT: [[ADDR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP54]], i32 0, i32 1 +; CLEANUP-CPS-NEXT: [[VAL_I_FCA_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[ADDR_I]], i32 0, i32 0 +; CLEANUP-CPS-NEXT: [[VAL_I_FCA_0_LOAD:%.*]] = load <2 x float>, ptr [[VAL_I_FCA_0_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[VAL_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[VAL_I_FCA_0_LOAD]], 0 +; 
CLEANUP-CPS-NEXT: [[VAL_I_FCA_0_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL_I_FCA_0_INSERT]], 0 +; CLEANUP-CPS-NEXT: [[DOTSROA_025_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VAL_I_FCA_0_INSERT_FCA_0_EXTRACT]], i32 0 +; CLEANUP-CPS-NEXT: [[TMP55:%.*]] = bitcast float [[DOTSROA_025_0_VEC_EXTRACT]] to i32 +; CLEANUP-CPS-NEXT: [[DOTSROA_025_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VAL_I_FCA_0_INSERT_FCA_0_EXTRACT]], i32 1 +; CLEANUP-CPS-NEXT: [[TMP56:%.*]] = bitcast float [[DOTSROA_025_4_VEC_EXTRACT]] to i32 +; CLEANUP-CPS-NEXT: [[HIT_ATTRS_FCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[HIT_ATTRS]], 0 +; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) +; CLEANUP-CPS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) +; CLEANUP-CPS-NEXT: [[TMP57:%.*]] = inttoptr i32 [[PAYLOAD_FCA_0_EXTRACT]] to ptr addrspace(32) +; CLEANUP-CPS-NEXT: store i32 [[TMP1]], ptr addrspace(32) [[TMP57]], align 4 +; CLEANUP-CPS-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 1 +; CLEANUP-CPS-NEXT: store i32 [[TMP3]], ptr addrspace(32) [[TMP58]], align 4 +; CLEANUP-CPS-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 2 +; CLEANUP-CPS-NEXT: store i32 [[TMP5]], ptr addrspace(32) [[TMP59]], align 4 +; CLEANUP-CPS-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 3 +; CLEANUP-CPS-NEXT: store i32 [[TMP7]], ptr addrspace(32) [[TMP60]], align 4 +; CLEANUP-CPS-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 4 +; CLEANUP-CPS-NEXT: store i32 [[TMP9]], ptr addrspace(32) [[TMP61]], align 4 +; CLEANUP-CPS-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 5 +; CLEANUP-CPS-NEXT: store i32 [[TMP11]], ptr addrspace(32) [[TMP62]], align 4 +; CLEANUP-CPS-NEXT: [[TMP63:%.*]] = getelementptr inbounds 
i32, ptr addrspace(32) [[TMP57]], i32 6 +; CLEANUP-CPS-NEXT: store i32 [[TMP13]], ptr addrspace(32) [[TMP63]], align 4 +; CLEANUP-CPS-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 7 +; CLEANUP-CPS-NEXT: store i32 [[TMP15]], ptr addrspace(32) [[TMP64]], align 4 +; CLEANUP-CPS-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 8 +; CLEANUP-CPS-NEXT: store i32 [[TMP17]], ptr addrspace(32) [[TMP65]], align 4 +; CLEANUP-CPS-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 9 +; CLEANUP-CPS-NEXT: store i32 [[TMP19]], ptr addrspace(32) [[TMP66]], align 4 +; CLEANUP-CPS-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 10 +; CLEANUP-CPS-NEXT: store i32 [[TMP21]], ptr addrspace(32) [[TMP67]], align 4 +; CLEANUP-CPS-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 11 +; CLEANUP-CPS-NEXT: store i32 [[TMP23]], ptr addrspace(32) [[TMP68]], align 4 +; CLEANUP-CPS-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 12 +; CLEANUP-CPS-NEXT: store i32 [[TMP25]], ptr addrspace(32) [[TMP69]], align 4 +; CLEANUP-CPS-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 13 +; CLEANUP-CPS-NEXT: store i32 [[TMP27]], ptr addrspace(32) [[TMP70]], align 4 +; CLEANUP-CPS-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 14 +; CLEANUP-CPS-NEXT: store i32 [[TMP29]], ptr addrspace(32) [[TMP71]], align 4 +; CLEANUP-CPS-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 15 +; CLEANUP-CPS-NEXT: store i32 [[TMP31]], ptr addrspace(32) [[TMP72]], align 4 +; CLEANUP-CPS-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 16 +; CLEANUP-CPS-NEXT: store i32 [[TMP33]], ptr addrspace(32) [[TMP73]], align 4 +; CLEANUP-CPS-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr addrspace(32) 
[[TMP57]], i32 17 +; CLEANUP-CPS-NEXT: store i32 [[TMP35]], ptr addrspace(32) [[TMP74]], align 4 +; CLEANUP-CPS-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 18 +; CLEANUP-CPS-NEXT: store i32 [[TMP37]], ptr addrspace(32) [[TMP75]], align 4 +; CLEANUP-CPS-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 19 +; CLEANUP-CPS-NEXT: store i32 [[TMP39]], ptr addrspace(32) [[TMP76]], align 4 +; CLEANUP-CPS-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 20 +; CLEANUP-CPS-NEXT: store i32 [[TMP41]], ptr addrspace(32) [[TMP77]], align 4 +; CLEANUP-CPS-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 21 +; CLEANUP-CPS-NEXT: store i32 [[TMP43]], ptr addrspace(32) [[TMP78]], align 4 +; CLEANUP-CPS-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 22 +; CLEANUP-CPS-NEXT: store i32 [[TMP45]], ptr addrspace(32) [[TMP79]], align 4 +; CLEANUP-CPS-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 23 +; CLEANUP-CPS-NEXT: store i32 [[TMP47]], ptr addrspace(32) [[TMP80]], align 4 +; CLEANUP-CPS-NEXT: [[TMP81:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 24 +; CLEANUP-CPS-NEXT: store i32 [[TMP49]], ptr addrspace(32) [[TMP81]], align 4 +; CLEANUP-CPS-NEXT: [[TMP82:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 25 +; CLEANUP-CPS-NEXT: store i32 [[TMP51]], ptr addrspace(32) [[TMP82]], align 4 +; CLEANUP-CPS-NEXT: [[TMP83:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP57]], i32 26 +; CLEANUP-CPS-NEXT: store i32 [[TMP53]], ptr addrspace(32) [[TMP83]], align 4 +; CLEANUP-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 +; CLEANUP-CPS-NEXT: [[TMP84:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT]] to i32 +; CLEANUP-CPS-NEXT: [[TMP85:%.*]] = bitcast i32 
[[TMP84]] to float +; CLEANUP-CPS-NEXT: [[DOTSROA_027_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP85]], i32 0 +; CLEANUP-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 +; CLEANUP-CPS-NEXT: [[TMP86:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT]] to i32 +; CLEANUP-CPS-NEXT: [[TMP87:%.*]] = bitcast i32 [[TMP86]] to float +; CLEANUP-CPS-NEXT: [[DOTSROA_027_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_027_0_VEC_INSERT]], float [[TMP87]], i32 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT26:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_027_4_VEC_INSERT]], 0 +; CLEANUP-CPS-NEXT: [[TMP88:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; CLEANUP-CPS-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP88]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT26]]) +; CLEANUP-CPS-NEXT: [[DOTFCA_0_0_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0, i32 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_0_0_0_LOAD:%.*]] = load <3 x i32>, ptr [[DOTFCA_0_0_0_0_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[DOTFCA_0_0_0_0_LOAD]], 0, 0, 0, 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_0_1_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 1, i32 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_0_1_0_LOAD:%.*]] = load <2 x float>, ptr [[DOTFCA_0_0_1_0_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_0_1_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_0_0_0_INSERT]], <2 x float> [[DOTFCA_0_0_1_0_LOAD]], 0, 0, 1, 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_1_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr 
[[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 1, i32 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_1_0_LOAD:%.*]] = load float, ptr [[DOTFCA_0_1_0_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_1_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_0_1_0_INSERT]], float [[DOTFCA_0_1_0_LOAD]], 0, 1, 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_1_1_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 1, i32 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_1_1_LOAD:%.*]] = load i32, ptr [[DOTFCA_0_1_1_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_1_1_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_1_0_INSERT]], i32 [[DOTFCA_0_1_1_LOAD]], 0, 1, 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_2_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_2_LOAD:%.*]] = load <3 x float>, ptr [[DOTFCA_0_2_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_1_1_INSERT]], <3 x float> [[DOTFCA_0_2_LOAD]], 0, 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_3_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 3 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_3_LOAD:%.*]] = load <3 x float>, ptr [[DOTFCA_0_3_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_2_INSERT]], <3 x float> [[DOTFCA_0_3_LOAD]], 0, 3 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_4_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_4_LOAD:%.*]] = load float, ptr [[DOTFCA_0_4_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_4_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_3_INSERT]], float [[DOTFCA_0_4_LOAD]], 0, 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_5_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], 
ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 5 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_5_LOAD:%.*]] = load i64, ptr [[DOTFCA_0_5_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_5_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_4_INSERT]], i64 [[DOTFCA_0_5_LOAD]], 0, 5 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_0_LOAD:%.*]] = load float, ptr [[DOTFCA_1_0_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_5_INSERT]], float [[DOTFCA_1_0_LOAD]], 1, 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_1_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_1_LOAD:%.*]] = load i32, ptr [[DOTFCA_1_1_GEP]], align 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_1_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_0_INSERT]], i32 [[DOTFCA_1_1_LOAD]], 1, 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [30 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 +; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 +; CLEANUP-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 +; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_6_INSERT]], 
i32 [[PAYLOAD_FCA_7_EXTRACT]], 7 +; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_7_INSERT]], i32 [[PAYLOAD_FCA_8_EXTRACT]], 8 +; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_8_INSERT]], i32 [[PAYLOAD_FCA_9_EXTRACT]], 9 +; CLEANUP-CPS-NEXT: [[DOTFCA_10_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_9_INSERT]], i32 [[PAYLOAD_FCA_10_EXTRACT]], 10 +; CLEANUP-CPS-NEXT: [[DOTFCA_11_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_10_INSERT]], i32 [[PAYLOAD_FCA_11_EXTRACT]], 11 +; CLEANUP-CPS-NEXT: [[DOTFCA_12_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_11_INSERT]], i32 [[PAYLOAD_FCA_12_EXTRACT]], 12 +; CLEANUP-CPS-NEXT: [[DOTFCA_13_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_12_INSERT]], i32 [[PAYLOAD_FCA_13_EXTRACT]], 13 +; CLEANUP-CPS-NEXT: [[DOTFCA_14_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_13_INSERT]], i32 [[PAYLOAD_FCA_14_EXTRACT]], 14 +; CLEANUP-CPS-NEXT: [[DOTFCA_15_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_14_INSERT]], i32 [[PAYLOAD_FCA_15_EXTRACT]], 15 +; CLEANUP-CPS-NEXT: [[DOTFCA_16_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_15_INSERT]], i32 [[PAYLOAD_FCA_16_EXTRACT]], 16 +; CLEANUP-CPS-NEXT: [[DOTFCA_17_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_16_INSERT]], i32 [[PAYLOAD_FCA_17_EXTRACT]], 17 +; CLEANUP-CPS-NEXT: [[DOTFCA_18_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_17_INSERT]], i32 [[PAYLOAD_FCA_18_EXTRACT]], 18 +; CLEANUP-CPS-NEXT: [[DOTFCA_19_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_18_INSERT]], i32 [[PAYLOAD_FCA_19_EXTRACT]], 19 +; CLEANUP-CPS-NEXT: [[DOTFCA_20_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_19_INSERT]], i32 [[PAYLOAD_FCA_20_EXTRACT]], 20 +; CLEANUP-CPS-NEXT: [[DOTFCA_21_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_20_INSERT]], i32 [[PAYLOAD_FCA_21_EXTRACT]], 21 +; CLEANUP-CPS-NEXT: [[DOTFCA_22_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_21_INSERT]], i32 [[PAYLOAD_FCA_22_EXTRACT]], 22 +; CLEANUP-CPS-NEXT: [[DOTFCA_23_INSERT:%.*]] = 
insertvalue [30 x i32] [[DOTFCA_22_INSERT]], i32 [[PAYLOAD_FCA_23_EXTRACT]], 23 +; CLEANUP-CPS-NEXT: [[DOTFCA_24_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_23_INSERT]], i32 [[PAYLOAD_FCA_24_EXTRACT]], 24 +; CLEANUP-CPS-NEXT: [[DOTFCA_25_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_24_INSERT]], i32 [[PAYLOAD_FCA_25_EXTRACT]], 25 +; CLEANUP-CPS-NEXT: [[DOTFCA_26_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_25_INSERT]], i32 [[PAYLOAD_FCA_26_EXTRACT]], 26 +; CLEANUP-CPS-NEXT: [[DOTFCA_27_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT]], i32 [[PAYLOAD_FCA_27_EXTRACT]], 27 +; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 +; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META17]] +; CLEANUP-CPS-NEXT: unreachable +; +; +; CLEANUP-CPS-LABEL: define void @ClosestHit( +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [21 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META26:![0-9]+]] !lgc.cps [[META23]] !continuation [[META27:![0-9]+]] !continuation.stacksize [[META21]] { +; CLEANUP-CPS-NEXT: AllocaSpillBB: +; CLEANUP-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 116) +; CLEANUP-CPS-NEXT: [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[CLOSESTHIT_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 +; CLEANUP-CPS-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[CLOSESTHIT_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], 
i32 0, i32 1 +; CLEANUP-CPS-NEXT: store i32 [[RETURNADDR]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 0 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT_SPILL_ADDR:%.*]] = getelementptr inbounds [[CLOSESTHIT_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 2 +; CLEANUP-CPS-NEXT: store i32 [[PAYLOAD_FCA_0_EXTRACT]], ptr addrspace(32) [[PAYLOAD_FCA_0_EXTRACT_SPILL_ADDR]], align 4 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 1 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 2 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 3 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 4 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 5 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 6 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 7 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 8 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 9 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_10_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 10 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_11_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 11 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_12_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 12 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_13_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 13 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_14_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 14 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_15_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 15 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_16_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 16 +; 
CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_17_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 17 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_18_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 18 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_19_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 19 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_20_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 20 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_21_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 21 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_22_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 22 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_23_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 23 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_24_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 24 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_25_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 25 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_26_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 26 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 27 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 28 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 29 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], 0, 0 +; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], 1, 0 +; CLEANUP-CPS-NEXT: [[TMP0:%.*]] = inttoptr i32 [[PAYLOAD_FCA_0_EXTRACT]] to ptr addrspace(32) +; CLEANUP-CPS-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(32) [[TMP0]], align 4 +; CLEANUP-CPS-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 1 +; CLEANUP-CPS-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(32) [[TMP2]], align 4 +; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 2 +; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(32) [[TMP4]], align 4 +; 
CLEANUP-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 3 +; CLEANUP-CPS-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(32) [[TMP6]], align 4 +; CLEANUP-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 4 +; CLEANUP-CPS-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(32) [[TMP8]], align 4 +; CLEANUP-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 5 +; CLEANUP-CPS-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(32) [[TMP10]], align 4 +; CLEANUP-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 6 +; CLEANUP-CPS-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(32) [[TMP12]], align 4 +; CLEANUP-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 7 +; CLEANUP-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(32) [[TMP14]], align 4 +; CLEANUP-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 8 +; CLEANUP-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(32) [[TMP16]], align 4 +; CLEANUP-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 9 +; CLEANUP-CPS-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(32) [[TMP18]], align 4 +; CLEANUP-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 10 +; CLEANUP-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(32) [[TMP20]], align 4 +; CLEANUP-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 11 +; CLEANUP-CPS-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(32) [[TMP22]], align 4 +; CLEANUP-CPS-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 12 +; CLEANUP-CPS-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(32) [[TMP24]], align 4 +; CLEANUP-CPS-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 13 +; CLEANUP-CPS-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(32) [[TMP26]], 
align 4 +; CLEANUP-CPS-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 14 +; CLEANUP-CPS-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(32) [[TMP28]], align 4 +; CLEANUP-CPS-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 15 +; CLEANUP-CPS-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(32) [[TMP30]], align 4 +; CLEANUP-CPS-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 16 +; CLEANUP-CPS-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(32) [[TMP32]], align 4 +; CLEANUP-CPS-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 17 +; CLEANUP-CPS-NEXT: [[TMP35:%.*]] = load i32, ptr addrspace(32) [[TMP34]], align 4 +; CLEANUP-CPS-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 18 +; CLEANUP-CPS-NEXT: [[TMP37:%.*]] = load i32, ptr addrspace(32) [[TMP36]], align 4 +; CLEANUP-CPS-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 19 +; CLEANUP-CPS-NEXT: [[TMP39:%.*]] = load i32, ptr addrspace(32) [[TMP38]], align 4 +; CLEANUP-CPS-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 20 +; CLEANUP-CPS-NEXT: [[TMP41:%.*]] = load i32, ptr addrspace(32) [[TMP40]], align 4 +; CLEANUP-CPS-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 21 +; CLEANUP-CPS-NEXT: [[TMP43:%.*]] = load i32, ptr addrspace(32) [[TMP42]], align 4 +; CLEANUP-CPS-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 22 +; CLEANUP-CPS-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(32) [[TMP44]], align 4 +; CLEANUP-CPS-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 23 +; CLEANUP-CPS-NEXT: [[TMP47:%.*]] = load i32, ptr addrspace(32) [[TMP46]], align 4 +; CLEANUP-CPS-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 24 +; CLEANUP-CPS-NEXT: [[TMP49:%.*]] = load i32, ptr 
addrspace(32) [[TMP48]], align 4 +; CLEANUP-CPS-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 25 +; CLEANUP-CPS-NEXT: [[TMP51:%.*]] = load i32, ptr addrspace(32) [[TMP50]], align 4 +; CLEANUP-CPS-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP0]], i32 26 +; CLEANUP-CPS-NEXT: [[TMP53:%.*]] = load i32, ptr addrspace(32) [[TMP52]], align 4 +; CLEANUP-CPS-NEXT: [[VAL_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] poison, <2 x float> [[SYSTEM_DATA_FCA_1_0_EXTRACT]], 0 +; CLEANUP-CPS-NEXT: [[VAL_I_FCA_0_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL_I_FCA_0_INSERT]], 0 +; CLEANUP-CPS-NEXT: [[DOTSROA_0257_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VAL_I_FCA_0_INSERT_FCA_0_EXTRACT]], i32 0 +; CLEANUP-CPS-NEXT: [[TMP54:%.*]] = bitcast float [[DOTSROA_0257_0_VEC_EXTRACT]] to i32 +; CLEANUP-CPS-NEXT: [[DOTSROA_0257_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VAL_I_FCA_0_INSERT_FCA_0_EXTRACT]], i32 1 +; CLEANUP-CPS-NEXT: [[TMP55:%.*]] = bitcast float [[DOTSROA_0257_4_VEC_EXTRACT]] to i32 +; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) +; CLEANUP-CPS-NEXT: [[TMP56:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 +; CLEANUP-CPS-NEXT: [[TMP57:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 +; CLEANUP-CPS-NEXT: [[TMP58:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP56]]) +; CLEANUP-CPS-NEXT: [[TMP59:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP58]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; CLEANUP-CPS-NEXT: [[TMP60:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP59]]) +; CLEANUP-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = 
insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[SYSTEM_DATA_FCA_0_0_EXTRACT]], 0 +; CLEANUP-CPS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 +; CLEANUP-CPS-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 +; CLEANUP-CPS-NEXT: [[TMP61:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @ClosestHit.resume.0) +; CLEANUP-CPS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP61]], 5 +; CLEANUP-CPS-NEXT: [[TMP62:%.*]] = ptrtoint ptr addrspace(32) [[PAYLOAD_SPILL_ALLOCA]] to i32 +; CLEANUP-CPS-NEXT: [[TMP63:%.*]] = inttoptr i32 [[TMP62]] to ptr addrspace(32) +; CLEANUP-CPS-NEXT: store i32 [[TMP1]], ptr addrspace(32) [[TMP63]], align 4 +; CLEANUP-CPS-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 1 +; CLEANUP-CPS-NEXT: store i32 [[TMP3]], ptr addrspace(32) [[TMP64]], align 4 +; CLEANUP-CPS-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 2 +; CLEANUP-CPS-NEXT: store i32 [[TMP5]], ptr addrspace(32) [[TMP65]], align 4 +; CLEANUP-CPS-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 3 +; CLEANUP-CPS-NEXT: store i32 [[TMP7]], ptr addrspace(32) [[TMP66]], align 4 +; CLEANUP-CPS-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 4 +; CLEANUP-CPS-NEXT: store i32 [[TMP9]], ptr addrspace(32) [[TMP67]], align 4 +; CLEANUP-CPS-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 5 +; CLEANUP-CPS-NEXT: store i32 [[TMP11]], ptr addrspace(32) [[TMP68]], align 4 +; CLEANUP-CPS-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 6 +; CLEANUP-CPS-NEXT: store i32 [[TMP13]], ptr addrspace(32) [[TMP69]], align 4 +; CLEANUP-CPS-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, 
ptr addrspace(32) [[TMP63]], i32 7 +; CLEANUP-CPS-NEXT: store i32 [[TMP15]], ptr addrspace(32) [[TMP70]], align 4 +; CLEANUP-CPS-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 8 +; CLEANUP-CPS-NEXT: store i32 [[TMP17]], ptr addrspace(32) [[TMP71]], align 4 +; CLEANUP-CPS-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 9 +; CLEANUP-CPS-NEXT: store i32 [[TMP19]], ptr addrspace(32) [[TMP72]], align 4 +; CLEANUP-CPS-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 10 +; CLEANUP-CPS-NEXT: store i32 [[TMP21]], ptr addrspace(32) [[TMP73]], align 4 +; CLEANUP-CPS-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 11 +; CLEANUP-CPS-NEXT: store i32 [[TMP23]], ptr addrspace(32) [[TMP74]], align 4 +; CLEANUP-CPS-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 12 +; CLEANUP-CPS-NEXT: store i32 [[TMP25]], ptr addrspace(32) [[TMP75]], align 4 +; CLEANUP-CPS-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 13 +; CLEANUP-CPS-NEXT: store i32 [[TMP27]], ptr addrspace(32) [[TMP76]], align 4 +; CLEANUP-CPS-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 14 +; CLEANUP-CPS-NEXT: store i32 [[TMP29]], ptr addrspace(32) [[TMP77]], align 4 +; CLEANUP-CPS-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 15 +; CLEANUP-CPS-NEXT: store i32 [[TMP31]], ptr addrspace(32) [[TMP78]], align 4 +; CLEANUP-CPS-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 16 +; CLEANUP-CPS-NEXT: store i32 [[TMP33]], ptr addrspace(32) [[TMP79]], align 4 +; CLEANUP-CPS-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 17 +; CLEANUP-CPS-NEXT: store i32 [[TMP35]], ptr addrspace(32) [[TMP80]], align 4 +; CLEANUP-CPS-NEXT: [[TMP81:%.*]] = getelementptr inbounds i32, ptr addrspace(32) 
[[TMP63]], i32 18 +; CLEANUP-CPS-NEXT: store i32 [[TMP37]], ptr addrspace(32) [[TMP81]], align 4 +; CLEANUP-CPS-NEXT: [[TMP82:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 19 +; CLEANUP-CPS-NEXT: store i32 [[TMP39]], ptr addrspace(32) [[TMP82]], align 4 +; CLEANUP-CPS-NEXT: [[TMP83:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 20 +; CLEANUP-CPS-NEXT: store i32 [[TMP41]], ptr addrspace(32) [[TMP83]], align 4 +; CLEANUP-CPS-NEXT: [[TMP84:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 21 +; CLEANUP-CPS-NEXT: store i32 [[TMP43]], ptr addrspace(32) [[TMP84]], align 4 +; CLEANUP-CPS-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 22 +; CLEANUP-CPS-NEXT: store i32 [[TMP45]], ptr addrspace(32) [[TMP85]], align 4 +; CLEANUP-CPS-NEXT: [[TMP86:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 23 +; CLEANUP-CPS-NEXT: store i32 [[TMP47]], ptr addrspace(32) [[TMP86]], align 4 +; CLEANUP-CPS-NEXT: [[TMP87:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 24 +; CLEANUP-CPS-NEXT: store i32 [[TMP49]], ptr addrspace(32) [[TMP87]], align 4 +; CLEANUP-CPS-NEXT: [[TMP88:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 25 +; CLEANUP-CPS-NEXT: store i32 [[TMP51]], ptr addrspace(32) [[TMP88]], align 4 +; CLEANUP-CPS-NEXT: [[TMP89:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP63]], i32 26 +; CLEANUP-CPS-NEXT: store i32 [[TMP53]], ptr addrspace(32) [[TMP89]], align 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT54:%.*]] = insertvalue [30 x i32] poison, i32 [[TMP62]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT57:%.*]] = insertvalue [30 x i32] [[DOTFCA_0_INSERT54]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT60:%.*]] = insertvalue [30 x i32] [[DOTFCA_1_INSERT57]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT63:%.*]] = insertvalue [30 x i32] [[DOTFCA_2_INSERT60]], i32 
[[PAYLOAD_FCA_3_EXTRACT]], 3 +; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT66:%.*]] = insertvalue [30 x i32] [[DOTFCA_3_INSERT63]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_5_INSERT69:%.*]] = insertvalue [30 x i32] [[DOTFCA_4_INSERT66]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 +; CLEANUP-CPS-NEXT: [[DOTFCA_6_INSERT72:%.*]] = insertvalue [30 x i32] [[DOTFCA_5_INSERT69]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 +; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT75:%.*]] = insertvalue [30 x i32] [[DOTFCA_6_INSERT72]], i32 [[PAYLOAD_FCA_7_EXTRACT]], 7 +; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT78:%.*]] = insertvalue [30 x i32] [[DOTFCA_7_INSERT75]], i32 [[PAYLOAD_FCA_8_EXTRACT]], 8 +; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT81:%.*]] = insertvalue [30 x i32] [[DOTFCA_8_INSERT78]], i32 [[PAYLOAD_FCA_9_EXTRACT]], 9 +; CLEANUP-CPS-NEXT: [[DOTFCA_10_INSERT84:%.*]] = insertvalue [30 x i32] [[DOTFCA_9_INSERT81]], i32 [[PAYLOAD_FCA_10_EXTRACT]], 10 +; CLEANUP-CPS-NEXT: [[DOTFCA_11_INSERT87:%.*]] = insertvalue [30 x i32] [[DOTFCA_10_INSERT84]], i32 [[PAYLOAD_FCA_11_EXTRACT]], 11 +; CLEANUP-CPS-NEXT: [[DOTFCA_12_INSERT90:%.*]] = insertvalue [30 x i32] [[DOTFCA_11_INSERT87]], i32 [[PAYLOAD_FCA_12_EXTRACT]], 12 +; CLEANUP-CPS-NEXT: [[DOTFCA_13_INSERT93:%.*]] = insertvalue [30 x i32] [[DOTFCA_12_INSERT90]], i32 [[PAYLOAD_FCA_13_EXTRACT]], 13 +; CLEANUP-CPS-NEXT: [[DOTFCA_14_INSERT96:%.*]] = insertvalue [30 x i32] [[DOTFCA_13_INSERT93]], i32 [[PAYLOAD_FCA_14_EXTRACT]], 14 +; CLEANUP-CPS-NEXT: [[DOTFCA_15_INSERT99:%.*]] = insertvalue [30 x i32] [[DOTFCA_14_INSERT96]], i32 [[PAYLOAD_FCA_15_EXTRACT]], 15 +; CLEANUP-CPS-NEXT: [[DOTFCA_16_INSERT102:%.*]] = insertvalue [30 x i32] [[DOTFCA_15_INSERT99]], i32 [[PAYLOAD_FCA_16_EXTRACT]], 16 +; CLEANUP-CPS-NEXT: [[DOTFCA_17_INSERT105:%.*]] = insertvalue [30 x i32] [[DOTFCA_16_INSERT102]], i32 [[PAYLOAD_FCA_17_EXTRACT]], 17 +; CLEANUP-CPS-NEXT: [[DOTFCA_18_INSERT108:%.*]] = insertvalue [30 x i32] [[DOTFCA_17_INSERT105]], i32 [[PAYLOAD_FCA_18_EXTRACT]], 18 +; 
CLEANUP-CPS-NEXT: [[DOTFCA_19_INSERT111:%.*]] = insertvalue [30 x i32] [[DOTFCA_18_INSERT108]], i32 [[PAYLOAD_FCA_19_EXTRACT]], 19 +; CLEANUP-CPS-NEXT: [[DOTFCA_20_INSERT114:%.*]] = insertvalue [30 x i32] [[DOTFCA_19_INSERT111]], i32 [[PAYLOAD_FCA_20_EXTRACT]], 20 +; CLEANUP-CPS-NEXT: [[DOTFCA_21_INSERT117:%.*]] = insertvalue [30 x i32] [[DOTFCA_20_INSERT114]], i32 [[PAYLOAD_FCA_21_EXTRACT]], 21 +; CLEANUP-CPS-NEXT: [[DOTFCA_22_INSERT120:%.*]] = insertvalue [30 x i32] [[DOTFCA_21_INSERT117]], i32 [[PAYLOAD_FCA_22_EXTRACT]], 22 +; CLEANUP-CPS-NEXT: [[DOTFCA_23_INSERT123:%.*]] = insertvalue [30 x i32] [[DOTFCA_22_INSERT120]], i32 [[PAYLOAD_FCA_23_EXTRACT]], 23 +; CLEANUP-CPS-NEXT: [[DOTFCA_24_INSERT126:%.*]] = insertvalue [30 x i32] [[DOTFCA_23_INSERT123]], i32 [[PAYLOAD_FCA_24_EXTRACT]], 24 +; CLEANUP-CPS-NEXT: [[DOTFCA_25_INSERT129:%.*]] = insertvalue [30 x i32] [[DOTFCA_24_INSERT126]], i32 [[PAYLOAD_FCA_25_EXTRACT]], 25 +; CLEANUP-CPS-NEXT: [[DOTFCA_26_INSERT132:%.*]] = insertvalue [30 x i32] [[DOTFCA_25_INSERT129]], i32 [[PAYLOAD_FCA_26_EXTRACT]], 26 +; CLEANUP-CPS-NEXT: [[DOTFCA_27_INSERT135:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT132]], i32 [[PAYLOAD_FCA_27_EXTRACT]], 27 +; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT138:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT135]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 +; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT141:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT138]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 4, i32 5, {} poison, i64 [[TMP61]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT141]]), !waitmask [[META22]], !continuation.returnedRegistercount [[META17]], !continuation.registercount [[META17]] +; CLEANUP-CPS-NEXT: unreachable +; +; +; CLEANUP-CPS-LABEL: define dso_local void @ClosestHit.resume.0( +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [23 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META26]] !lgc.cps [[META23]] !continuation [[META27]] { +; CLEANUP-CPS-NEXT: entryresume.0: +; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 116) +; CLEANUP-CPS-NEXT: [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[CLOSESTHIT_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 0 +; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP3]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 3 +; CLEANUP-CPS-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_5_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 5 +; CLEANUP-CPS-NEXT: [[DOTFCA_6_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 6 +; CLEANUP-CPS-NEXT: [[DOTFCA_7_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 7 +; CLEANUP-CPS-NEXT: [[DOTFCA_8_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 8 +; CLEANUP-CPS-NEXT: [[DOTFCA_9_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 9 +; CLEANUP-CPS-NEXT: [[DOTFCA_10_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 10 +; CLEANUP-CPS-NEXT: [[DOTFCA_11_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 11 +; 
CLEANUP-CPS-NEXT: [[DOTFCA_12_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 12 +; CLEANUP-CPS-NEXT: [[DOTFCA_13_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 13 +; CLEANUP-CPS-NEXT: [[DOTFCA_14_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 14 +; CLEANUP-CPS-NEXT: [[DOTFCA_15_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 15 +; CLEANUP-CPS-NEXT: [[DOTFCA_16_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 16 +; CLEANUP-CPS-NEXT: [[DOTFCA_17_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 17 +; CLEANUP-CPS-NEXT: [[DOTFCA_18_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 18 +; CLEANUP-CPS-NEXT: [[DOTFCA_19_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 19 +; CLEANUP-CPS-NEXT: [[DOTFCA_20_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 20 +; CLEANUP-CPS-NEXT: [[DOTFCA_21_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 21 +; CLEANUP-CPS-NEXT: [[DOTFCA_22_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 22 +; CLEANUP-CPS-NEXT: [[DOTFCA_23_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 23 +; CLEANUP-CPS-NEXT: [[DOTFCA_24_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 24 +; CLEANUP-CPS-NEXT: [[DOTFCA_25_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 25 +; CLEANUP-CPS-NEXT: [[DOTFCA_26_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 26 +; CLEANUP-CPS-NEXT: [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 27 +; CLEANUP-CPS-NEXT: [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 28 +; CLEANUP-CPS-NEXT: [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP5]], 29 +; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT]] to ptr addrspace(32) +; CLEANUP-CPS-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(32) [[TMP6]], align 4 +; CLEANUP-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 1 +; CLEANUP-CPS-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(32) [[TMP8]], align 4 +; CLEANUP-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 2 +; 
CLEANUP-CPS-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(32) [[TMP10]], align 4 +; CLEANUP-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 3 +; CLEANUP-CPS-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(32) [[TMP12]], align 4 +; CLEANUP-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 4 +; CLEANUP-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(32) [[TMP14]], align 4 +; CLEANUP-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 5 +; CLEANUP-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(32) [[TMP16]], align 4 +; CLEANUP-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 6 +; CLEANUP-CPS-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(32) [[TMP18]], align 4 +; CLEANUP-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 7 +; CLEANUP-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(32) [[TMP20]], align 4 +; CLEANUP-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 8 +; CLEANUP-CPS-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(32) [[TMP22]], align 4 +; CLEANUP-CPS-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 9 +; CLEANUP-CPS-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(32) [[TMP24]], align 4 +; CLEANUP-CPS-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 10 +; CLEANUP-CPS-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(32) [[TMP26]], align 4 +; CLEANUP-CPS-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 11 +; CLEANUP-CPS-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(32) [[TMP28]], align 4 +; CLEANUP-CPS-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 12 +; CLEANUP-CPS-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(32) [[TMP30]], align 4 +; CLEANUP-CPS-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr addrspace(32) 
[[TMP6]], i32 13 +; CLEANUP-CPS-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(32) [[TMP32]], align 4 +; CLEANUP-CPS-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 14 +; CLEANUP-CPS-NEXT: [[TMP35:%.*]] = load i32, ptr addrspace(32) [[TMP34]], align 4 +; CLEANUP-CPS-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 15 +; CLEANUP-CPS-NEXT: [[TMP37:%.*]] = load i32, ptr addrspace(32) [[TMP36]], align 4 +; CLEANUP-CPS-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 16 +; CLEANUP-CPS-NEXT: [[TMP39:%.*]] = load i32, ptr addrspace(32) [[TMP38]], align 4 +; CLEANUP-CPS-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 17 +; CLEANUP-CPS-NEXT: [[TMP41:%.*]] = load i32, ptr addrspace(32) [[TMP40]], align 4 +; CLEANUP-CPS-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 18 +; CLEANUP-CPS-NEXT: [[TMP43:%.*]] = load i32, ptr addrspace(32) [[TMP42]], align 4 +; CLEANUP-CPS-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 19 +; CLEANUP-CPS-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(32) [[TMP44]], align 4 +; CLEANUP-CPS-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 20 +; CLEANUP-CPS-NEXT: [[TMP47:%.*]] = load i32, ptr addrspace(32) [[TMP46]], align 4 +; CLEANUP-CPS-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 21 +; CLEANUP-CPS-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(32) [[TMP48]], align 4 +; CLEANUP-CPS-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 22 +; CLEANUP-CPS-NEXT: [[TMP51:%.*]] = load i32, ptr addrspace(32) [[TMP50]], align 4 +; CLEANUP-CPS-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 23 +; CLEANUP-CPS-NEXT: [[TMP53:%.*]] = load i32, ptr addrspace(32) [[TMP52]], align 4 +; CLEANUP-CPS-NEXT: [[TMP54:%.*]] = getelementptr 
inbounds i32, ptr addrspace(32) [[TMP6]], i32 24 +; CLEANUP-CPS-NEXT: [[TMP55:%.*]] = load i32, ptr addrspace(32) [[TMP54]], align 4 +; CLEANUP-CPS-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 25 +; CLEANUP-CPS-NEXT: [[TMP57:%.*]] = load i32, ptr addrspace(32) [[TMP56]], align 4 +; CLEANUP-CPS-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 26 +; CLEANUP-CPS-NEXT: [[TMP59:%.*]] = load i32, ptr addrspace(32) [[TMP58]], align 4 +; CLEANUP-CPS-NEXT: [[TMP60:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT]] to ptr addrspace(32) +; CLEANUP-CPS-NEXT: [[TMP61:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP3]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT254:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP61]], 0 +; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT_RELOAD_ADDR:%.*]] = getelementptr inbounds [[CLOSESTHIT_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 2 +; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT_RELOAD:%.*]] = load i32, ptr addrspace(32) [[PAYLOAD_FCA_0_EXTRACT_RELOAD_ADDR]], align 4 +; CLEANUP-CPS-NEXT: [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[CLOSESTHIT_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 1 +; CLEANUP-CPS-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i32, ptr addrspace(32) [[RETURNADDR_RELOAD_ADDR]], align 4 +; CLEANUP-CPS-NEXT: [[TMP62:%.*]] = inttoptr i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]] to ptr addrspace(32) +; CLEANUP-CPS-NEXT: store i32 [[TMP7]], ptr addrspace(32) [[TMP62]], align 4 +; CLEANUP-CPS-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 1 +; CLEANUP-CPS-NEXT: store i32 [[TMP9]], ptr addrspace(32) [[TMP63]], align 4 +; CLEANUP-CPS-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 2 +; CLEANUP-CPS-NEXT: store i32 [[TMP11]], ptr addrspace(32) [[TMP64]], align 4 +; CLEANUP-CPS-NEXT: [[TMP65:%.*]] = 
getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 3 +; CLEANUP-CPS-NEXT: store i32 [[TMP13]], ptr addrspace(32) [[TMP65]], align 4 +; CLEANUP-CPS-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 4 +; CLEANUP-CPS-NEXT: store i32 [[TMP15]], ptr addrspace(32) [[TMP66]], align 4 +; CLEANUP-CPS-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 5 +; CLEANUP-CPS-NEXT: store i32 [[TMP17]], ptr addrspace(32) [[TMP67]], align 4 +; CLEANUP-CPS-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 6 +; CLEANUP-CPS-NEXT: store i32 [[TMP19]], ptr addrspace(32) [[TMP68]], align 4 +; CLEANUP-CPS-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 7 +; CLEANUP-CPS-NEXT: store i32 [[TMP21]], ptr addrspace(32) [[TMP69]], align 4 +; CLEANUP-CPS-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 8 +; CLEANUP-CPS-NEXT: store i32 [[TMP23]], ptr addrspace(32) [[TMP70]], align 4 +; CLEANUP-CPS-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 9 +; CLEANUP-CPS-NEXT: store i32 [[TMP25]], ptr addrspace(32) [[TMP71]], align 4 +; CLEANUP-CPS-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 10 +; CLEANUP-CPS-NEXT: store i32 [[TMP27]], ptr addrspace(32) [[TMP72]], align 4 +; CLEANUP-CPS-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 11 +; CLEANUP-CPS-NEXT: store i32 [[TMP29]], ptr addrspace(32) [[TMP73]], align 4 +; CLEANUP-CPS-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 12 +; CLEANUP-CPS-NEXT: store i32 [[TMP31]], ptr addrspace(32) [[TMP74]], align 4 +; CLEANUP-CPS-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 13 +; CLEANUP-CPS-NEXT: store i32 [[TMP33]], ptr addrspace(32) [[TMP75]], align 4 +; CLEANUP-CPS-NEXT: [[TMP76:%.*]] = getelementptr inbounds 
i32, ptr addrspace(32) [[TMP62]], i32 14 +; CLEANUP-CPS-NEXT: store i32 [[TMP35]], ptr addrspace(32) [[TMP76]], align 4 +; CLEANUP-CPS-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 15 +; CLEANUP-CPS-NEXT: store i32 [[TMP37]], ptr addrspace(32) [[TMP77]], align 4 +; CLEANUP-CPS-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 16 +; CLEANUP-CPS-NEXT: store i32 [[TMP39]], ptr addrspace(32) [[TMP78]], align 4 +; CLEANUP-CPS-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 17 +; CLEANUP-CPS-NEXT: store i32 [[TMP41]], ptr addrspace(32) [[TMP79]], align 4 +; CLEANUP-CPS-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 18 +; CLEANUP-CPS-NEXT: store i32 [[TMP43]], ptr addrspace(32) [[TMP80]], align 4 +; CLEANUP-CPS-NEXT: [[TMP81:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 19 +; CLEANUP-CPS-NEXT: store i32 [[TMP45]], ptr addrspace(32) [[TMP81]], align 4 +; CLEANUP-CPS-NEXT: [[TMP82:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 20 +; CLEANUP-CPS-NEXT: store i32 [[TMP47]], ptr addrspace(32) [[TMP82]], align 4 +; CLEANUP-CPS-NEXT: [[TMP83:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 21 +; CLEANUP-CPS-NEXT: store i32 [[TMP49]], ptr addrspace(32) [[TMP83]], align 4 +; CLEANUP-CPS-NEXT: [[TMP84:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 22 +; CLEANUP-CPS-NEXT: store i32 [[TMP51]], ptr addrspace(32) [[TMP84]], align 4 +; CLEANUP-CPS-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 23 +; CLEANUP-CPS-NEXT: store i32 [[TMP53]], ptr addrspace(32) [[TMP85]], align 4 +; CLEANUP-CPS-NEXT: [[TMP86:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 24 +; CLEANUP-CPS-NEXT: store i32 [[TMP55]], ptr addrspace(32) [[TMP86]], align 4 +; CLEANUP-CPS-NEXT: [[TMP87:%.*]] = getelementptr inbounds i32, ptr 
addrspace(32) [[TMP62]], i32 25 +; CLEANUP-CPS-NEXT: store i32 [[TMP57]], ptr addrspace(32) [[TMP87]], align 4 +; CLEANUP-CPS-NEXT: [[TMP88:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP62]], i32 26 +; CLEANUP-CPS-NEXT: store i32 [[TMP59]], ptr addrspace(32) [[TMP88]], align 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT253:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT254]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [30 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 0 +; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_0_INSERT]], i32 [[DOTFCA_1_EXTRACT]], 1 +; CLEANUP-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_1_INSERT]], i32 [[DOTFCA_2_EXTRACT]], 2 +; CLEANUP-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_2_INSERT]], i32 [[DOTFCA_3_EXTRACT]], 3 +; CLEANUP-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_3_INSERT]], i32 [[DOTFCA_4_EXTRACT]], 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_4_INSERT]], i32 [[DOTFCA_5_EXTRACT]], 5 +; CLEANUP-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_5_INSERT]], i32 [[DOTFCA_6_EXTRACT]], 6 +; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_6_INSERT]], i32 [[DOTFCA_7_EXTRACT]], 7 +; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_7_INSERT]], i32 [[DOTFCA_8_EXTRACT]], 8 +; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_8_INSERT]], i32 [[DOTFCA_9_EXTRACT]], 9 +; CLEANUP-CPS-NEXT: [[DOTFCA_10_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_9_INSERT]], i32 [[DOTFCA_10_EXTRACT]], 10 +; CLEANUP-CPS-NEXT: [[DOTFCA_11_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_10_INSERT]], i32 [[DOTFCA_11_EXTRACT]], 11 +; CLEANUP-CPS-NEXT: [[DOTFCA_12_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_11_INSERT]], i32 [[DOTFCA_12_EXTRACT]], 12 +; 
CLEANUP-CPS-NEXT: [[DOTFCA_13_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_12_INSERT]], i32 [[DOTFCA_13_EXTRACT]], 13 +; CLEANUP-CPS-NEXT: [[DOTFCA_14_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_13_INSERT]], i32 [[DOTFCA_14_EXTRACT]], 14 +; CLEANUP-CPS-NEXT: [[DOTFCA_15_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_14_INSERT]], i32 [[DOTFCA_15_EXTRACT]], 15 +; CLEANUP-CPS-NEXT: [[DOTFCA_16_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_15_INSERT]], i32 [[DOTFCA_16_EXTRACT]], 16 +; CLEANUP-CPS-NEXT: [[DOTFCA_17_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_16_INSERT]], i32 [[DOTFCA_17_EXTRACT]], 17 +; CLEANUP-CPS-NEXT: [[DOTFCA_18_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_17_INSERT]], i32 [[DOTFCA_18_EXTRACT]], 18 +; CLEANUP-CPS-NEXT: [[DOTFCA_19_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_18_INSERT]], i32 [[DOTFCA_19_EXTRACT]], 19 +; CLEANUP-CPS-NEXT: [[DOTFCA_20_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_19_INSERT]], i32 [[DOTFCA_20_EXTRACT]], 20 +; CLEANUP-CPS-NEXT: [[DOTFCA_21_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_20_INSERT]], i32 [[DOTFCA_21_EXTRACT]], 21 +; CLEANUP-CPS-NEXT: [[DOTFCA_22_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_21_INSERT]], i32 [[DOTFCA_22_EXTRACT]], 22 +; CLEANUP-CPS-NEXT: [[DOTFCA_23_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_22_INSERT]], i32 [[DOTFCA_23_EXTRACT]], 23 +; CLEANUP-CPS-NEXT: [[DOTFCA_24_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_23_INSERT]], i32 [[DOTFCA_24_EXTRACT]], 24 +; CLEANUP-CPS-NEXT: [[DOTFCA_25_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_24_INSERT]], i32 [[DOTFCA_25_EXTRACT]], 25 +; CLEANUP-CPS-NEXT: [[DOTFCA_26_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_25_INSERT]], i32 [[DOTFCA_26_EXTRACT]], 26 +; CLEANUP-CPS-NEXT: [[DOTFCA_27_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT]], i32 [[DOTFCA_27_EXTRACT]], 27 +; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[DOTFCA_28_EXTRACT]], 28 +; CLEANUP-CPS-NEXT: 
[[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29 +; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 116) +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR_RELOAD]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT253]], [23 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META17]] +; CLEANUP-CPS-NEXT: unreachable +; +; +; POST-PROCESS-CPS-LABEL: define %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes( +; POST-PROCESS-CPS-SAME: ptr [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { +; POST-PROCESS-CPS-NEXT: [[ADDR:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[DATA]], i32 0, i32 1 +; POST-PROCESS-CPS-NEXT: [[VAL:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], ptr [[ADDR]], align 4 +; POST-PROCESS-CPS-NEXT: ret [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL]] +; +; +; POST-PROCESS-CPS-LABEL: define i32 @_cont_GetLocalRootIndex( +; POST-PROCESS-CPS-SAME: ptr [[DATA:%.*]]) #[[ATTR0]] { +; POST-PROCESS-CPS-NEXT: ret i32 5 +; +; +; POST-PROCESS-CPS-LABEL: define void @main( +; POST-PROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !lgc.cps [[META19:![0-9]+]] !continuation [[META20:![0-9]+]] !continuation.stacksize [[META21:![0-9]+]] { +; POST-PROCESS-CPS-NEXT: AllocaSpillBB: +; POST-PROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; POST-PROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 108 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP2]], ptr [[CSP]], align 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT56:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; POST-PROCESS-CPS-NEXT: call 
void @amd.dx.setLocalRootIndex(i32 0) +; POST-PROCESS-CPS-NEXT: [[TMP3:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 +; POST-PROCESS-CPS-NEXT: [[TMP4:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 +; POST-PROCESS-CPS-NEXT: [[TMP5:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) +; POST-PROCESS-CPS-NEXT: [[TMP6:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP5]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; POST-PROCESS-CPS-NEXT: [[TMP7:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP6]]) +; POST-PROCESS-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT56]], 0 +; POST-PROCESS-CPS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 +; POST-PROCESS-CPS-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 +; POST-PROCESS-CPS-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(ptr @main.resume.0) +; POST-PROCESS-CPS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP8]], 5 +; POST-PROCESS-CPS-NEXT: [[TMP9:%.*]] = inttoptr i32 [[TMP1]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP9]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP10]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP11:%.*]] = add i32 [[TMP1]], 4 +; POST-PROCESS-CPS-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP11]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr 
addrspace(21) [[TMP12]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP13]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP14:%.*]] = add i32 [[TMP1]], 8 +; POST-PROCESS-CPS-NEXT: [[TMP15:%.*]] = inttoptr i32 [[TMP14]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP15]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP16]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP17:%.*]] = add i32 [[TMP1]], 12 +; POST-PROCESS-CPS-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP17]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP18]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP19]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP20:%.*]] = add i32 [[TMP1]], 16 +; POST-PROCESS-CPS-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP20]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP21]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP22]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP23:%.*]] = add i32 [[TMP1]], 20 +; POST-PROCESS-CPS-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP23]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP24]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP25]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP26:%.*]] = add i32 [[TMP1]], 24 +; POST-PROCESS-CPS-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP26]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP27]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP28]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP29:%.*]] = add i32 [[TMP1]], 28 +; POST-PROCESS-CPS-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP29]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP30]], i32 0 +; POST-PROCESS-CPS-NEXT: 
store i32 undef, ptr addrspace(21) [[TMP31]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP32:%.*]] = add i32 [[TMP1]], 32 +; POST-PROCESS-CPS-NEXT: [[TMP33:%.*]] = inttoptr i32 [[TMP32]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP33]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP34]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP35:%.*]] = add i32 [[TMP1]], 36 +; POST-PROCESS-CPS-NEXT: [[TMP36:%.*]] = inttoptr i32 [[TMP35]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP36]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP37]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP38:%.*]] = add i32 [[TMP1]], 40 +; POST-PROCESS-CPS-NEXT: [[TMP39:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP39]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP40]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP41:%.*]] = add i32 [[TMP1]], 44 +; POST-PROCESS-CPS-NEXT: [[TMP42:%.*]] = inttoptr i32 [[TMP41]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP42]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP43]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP44:%.*]] = add i32 [[TMP1]], 48 +; POST-PROCESS-CPS-NEXT: [[TMP45:%.*]] = inttoptr i32 [[TMP44]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP45]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP46]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP47:%.*]] = add i32 [[TMP1]], 52 +; POST-PROCESS-CPS-NEXT: [[TMP48:%.*]] = inttoptr i32 [[TMP47]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP48]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP49]], align 4 
+; POST-PROCESS-CPS-NEXT: [[TMP50:%.*]] = add i32 [[TMP1]], 56 +; POST-PROCESS-CPS-NEXT: [[TMP51:%.*]] = inttoptr i32 [[TMP50]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP51]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP52]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP53:%.*]] = add i32 [[TMP1]], 60 +; POST-PROCESS-CPS-NEXT: [[TMP54:%.*]] = inttoptr i32 [[TMP53]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP54]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP55]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP56:%.*]] = add i32 [[TMP1]], 64 +; POST-PROCESS-CPS-NEXT: [[TMP57:%.*]] = inttoptr i32 [[TMP56]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP57]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP58]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP59:%.*]] = add i32 [[TMP1]], 68 +; POST-PROCESS-CPS-NEXT: [[TMP60:%.*]] = inttoptr i32 [[TMP59]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP60]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP61]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP62:%.*]] = add i32 [[TMP1]], 72 +; POST-PROCESS-CPS-NEXT: [[TMP63:%.*]] = inttoptr i32 [[TMP62]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP63]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP64]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP65:%.*]] = add i32 [[TMP1]], 76 +; POST-PROCESS-CPS-NEXT: [[TMP66:%.*]] = inttoptr i32 [[TMP65]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP66]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP67]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP68:%.*]] = add i32 
[[TMP1]], 80 +; POST-PROCESS-CPS-NEXT: [[TMP69:%.*]] = inttoptr i32 [[TMP68]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP70:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP69]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP70]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP71:%.*]] = add i32 [[TMP1]], 84 +; POST-PROCESS-CPS-NEXT: [[TMP72:%.*]] = inttoptr i32 [[TMP71]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP73:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP72]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP73]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP74:%.*]] = add i32 [[TMP1]], 88 +; POST-PROCESS-CPS-NEXT: [[TMP75:%.*]] = inttoptr i32 [[TMP74]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP76:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP75]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP76]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP77:%.*]] = add i32 [[TMP1]], 92 +; POST-PROCESS-CPS-NEXT: [[TMP78:%.*]] = inttoptr i32 [[TMP77]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP79:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP78]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP79]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP80:%.*]] = add i32 [[TMP1]], 96 +; POST-PROCESS-CPS-NEXT: [[TMP81:%.*]] = inttoptr i32 [[TMP80]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP82:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP81]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP82]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP83:%.*]] = add i32 [[TMP1]], 100 +; POST-PROCESS-CPS-NEXT: [[TMP84:%.*]] = inttoptr i32 [[TMP83]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP85:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP84]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP85]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP86:%.*]] = add i32 [[TMP1]], 104 +; POST-PROCESS-CPS-NEXT: [[TMP87:%.*]] 
= inttoptr i32 [[TMP86]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP88:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP87]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 undef, ptr addrspace(21) [[TMP88]], align 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [30 x i32] poison, i32 [[TMP1]], 0 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_0_INSERT]], i32 undef, 1 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_1_INSERT]], i32 undef, 2 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_2_INSERT]], i32 undef, 3 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_3_INSERT]], i32 undef, 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_4_INSERT]], i32 undef, 5 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_5_INSERT]], i32 undef, 6 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_6_INSERT]], i32 undef, 7 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_7_INSERT]], i32 undef, 8 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_8_INSERT]], i32 undef, 9 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_10_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_9_INSERT]], i32 undef, 10 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_11_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_10_INSERT]], i32 undef, 11 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_12_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_11_INSERT]], i32 undef, 12 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_13_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_12_INSERT]], i32 undef, 13 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_14_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_13_INSERT]], i32 undef, 14 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_15_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_14_INSERT]], i32 undef, 15 +; POST-PROCESS-CPS-NEXT: 
[[DOTFCA_16_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_15_INSERT]], i32 undef, 16 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_17_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_16_INSERT]], i32 undef, 17 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_18_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_17_INSERT]], i32 undef, 18 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_19_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_18_INSERT]], i32 undef, 19 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_20_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_19_INSERT]], i32 undef, 20 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_21_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_20_INSERT]], i32 undef, 21 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_22_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_21_INSERT]], i32 undef, 22 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_23_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_22_INSERT]], i32 undef, 23 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_24_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_23_INSERT]], i32 undef, 24 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_25_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_24_INSERT]], i32 undef, 25 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_26_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_25_INSERT]], i32 undef, 26 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_27_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT]], i32 undef, 27 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 undef, 28 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 undef, 29 +; POST-PROCESS-CPS-NEXT: [[TMP89:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.waitContinue(i64 4, i64 -1, i32 [[TMP89]], i64 [[TMP8]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POST-PROCESS-CPS-NEXT: unreachable +; +; +; POST-PROCESS-CPS-LABEL: define dso_local void @main.resume.0( +; POST-PROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [23 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META8]] !lgc.cps [[META19]] !continuation [[META20]] { +; POST-PROCESS-CPS-NEXT: entryresume.0: +; POST-PROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; POST-PROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], -108 +; POST-PROCESS-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP3]], 2 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 0 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 1 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 2 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 3 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_5_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 5 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_6_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 6 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_7_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 7 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_8_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 8 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_9_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 9 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_10_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 10 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_11_EXTRACT:%.*]] = 
extractvalue [30 x i32] [[TMP6]], 11 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_12_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 12 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_13_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 13 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_14_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 14 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_15_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 15 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_16_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 16 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_17_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 17 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_18_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 18 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_19_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 19 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_20_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 20 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_21_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 21 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_22_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 22 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_23_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 23 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_24_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 24 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_25_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 25 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_26_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 26 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 27 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 28 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 29 +; POST-PROCESS-CPS-NEXT: [[TMP7:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP7]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(21) [[TMP8]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP10:%.*]] = add 
i32 [[DOTFCA_0_EXTRACT]], 4 +; POST-PROCESS-CPS-NEXT: [[TMP11:%.*]] = inttoptr i32 [[TMP10]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP11]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(21) [[TMP12]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP14:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 8 +; POST-PROCESS-CPS-NEXT: [[TMP15:%.*]] = inttoptr i32 [[TMP14]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP15]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(21) [[TMP16]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP18:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 12 +; POST-PROCESS-CPS-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP18]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP19]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(21) [[TMP20]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP22:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 16 +; POST-PROCESS-CPS-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP22]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP23]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(21) [[TMP24]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP26:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 20 +; POST-PROCESS-CPS-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP26]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP27]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(21) [[TMP28]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP30:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 24 +; POST-PROCESS-CPS-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP30]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP31]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP33:%.*]] = load i32, ptr 
addrspace(21) [[TMP32]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP34:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 28 +; POST-PROCESS-CPS-NEXT: [[TMP35:%.*]] = inttoptr i32 [[TMP34]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP35]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP37:%.*]] = load i32, ptr addrspace(21) [[TMP36]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP38:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 32 +; POST-PROCESS-CPS-NEXT: [[TMP39:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP39]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP41:%.*]] = load i32, ptr addrspace(21) [[TMP40]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP42:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 36 +; POST-PROCESS-CPS-NEXT: [[TMP43:%.*]] = inttoptr i32 [[TMP42]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP43]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(21) [[TMP44]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP46:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 40 +; POST-PROCESS-CPS-NEXT: [[TMP47:%.*]] = inttoptr i32 [[TMP46]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP47]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(21) [[TMP48]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP50:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 44 +; POST-PROCESS-CPS-NEXT: [[TMP51:%.*]] = inttoptr i32 [[TMP50]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP51]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP53:%.*]] = load i32, ptr addrspace(21) [[TMP52]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP54:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 48 +; POST-PROCESS-CPS-NEXT: [[TMP55:%.*]] = inttoptr i32 [[TMP54]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr addrspace(21) 
[[TMP55]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP57:%.*]] = load i32, ptr addrspace(21) [[TMP56]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP58:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 52 +; POST-PROCESS-CPS-NEXT: [[TMP59:%.*]] = inttoptr i32 [[TMP58]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP59]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP61:%.*]] = load i32, ptr addrspace(21) [[TMP60]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP62:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 56 +; POST-PROCESS-CPS-NEXT: [[TMP63:%.*]] = inttoptr i32 [[TMP62]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP63]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP65:%.*]] = load i32, ptr addrspace(21) [[TMP64]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP66:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 60 +; POST-PROCESS-CPS-NEXT: [[TMP67:%.*]] = inttoptr i32 [[TMP66]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP67]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP69:%.*]] = load i32, ptr addrspace(21) [[TMP68]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP70:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 64 +; POST-PROCESS-CPS-NEXT: [[TMP71:%.*]] = inttoptr i32 [[TMP70]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP72:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP71]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP73:%.*]] = load i32, ptr addrspace(21) [[TMP72]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP74:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 68 +; POST-PROCESS-CPS-NEXT: [[TMP75:%.*]] = inttoptr i32 [[TMP74]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP76:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP75]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP77:%.*]] = load i32, ptr addrspace(21) [[TMP76]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP78:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 72 +; POST-PROCESS-CPS-NEXT: [[TMP79:%.*]] = inttoptr i32 [[TMP78]] to ptr addrspace(21) +; 
POST-PROCESS-CPS-NEXT: [[TMP80:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP79]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP81:%.*]] = load i32, ptr addrspace(21) [[TMP80]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP82:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 76 +; POST-PROCESS-CPS-NEXT: [[TMP83:%.*]] = inttoptr i32 [[TMP82]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP84:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP83]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP85:%.*]] = load i32, ptr addrspace(21) [[TMP84]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP86:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 80 +; POST-PROCESS-CPS-NEXT: [[TMP87:%.*]] = inttoptr i32 [[TMP86]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP88:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP87]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP89:%.*]] = load i32, ptr addrspace(21) [[TMP88]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP90:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 84 +; POST-PROCESS-CPS-NEXT: [[TMP91:%.*]] = inttoptr i32 [[TMP90]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP92:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP91]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP93:%.*]] = load i32, ptr addrspace(21) [[TMP92]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP94:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 88 +; POST-PROCESS-CPS-NEXT: [[TMP95:%.*]] = inttoptr i32 [[TMP94]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP96:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP95]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP97:%.*]] = load i32, ptr addrspace(21) [[TMP96]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP98:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 92 +; POST-PROCESS-CPS-NEXT: [[TMP99:%.*]] = inttoptr i32 [[TMP98]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP100:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP99]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP101:%.*]] = load i32, ptr addrspace(21) [[TMP100]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP102:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 96 +; 
POST-PROCESS-CPS-NEXT: [[TMP103:%.*]] = inttoptr i32 [[TMP102]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP104:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP103]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP105:%.*]] = load i32, ptr addrspace(21) [[TMP104]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP106:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 100 +; POST-PROCESS-CPS-NEXT: [[TMP107:%.*]] = inttoptr i32 [[TMP106]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP108:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP107]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP109:%.*]] = load i32, ptr addrspace(21) [[TMP108]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP110:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 104 +; POST-PROCESS-CPS-NEXT: [[TMP111:%.*]] = inttoptr i32 [[TMP110]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP112:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP111]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP113:%.*]] = load i32, ptr addrspace(21) [[TMP112]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP114:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP3]], 0 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT57:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP114]], 0 +; POST-PROCESS-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) +; POST-PROCESS-CPS-NEXT: [[TMP115:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP116:%.*]] = add i32 [[TMP115]], -108 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP116]], ptr [[CSP]], align 4 +; POST-PROCESS-CPS-NEXT: ret void +; +; +; POST-PROCESS-CPS-LABEL: define void @AnyHit( +; POST-PROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META23:![0-9]+]] !continuation 
[[META24:![0-9]+]] { +; POST-PROCESS-CPS-NEXT: AllocaSpillBB: +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 +; POST-PROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; POST-PROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 0 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 1 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 2 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 3 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 4 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 5 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 6 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 7 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 8 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 9 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_10_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 10 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_11_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 11 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_12_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 12 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_13_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 13 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_14_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 14 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_15_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 15 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_16_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 16 +; POST-PROCESS-CPS-NEXT: 
[[PAYLOAD_FCA_17_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 17 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_18_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 18 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_19_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 19 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_20_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 20 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_21_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 21 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_22_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 22 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_23_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 23 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_24_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 24 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_25_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 25 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_26_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 26 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 27 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 28 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 29 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 0, 0, 0 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0, i32 0 +; POST-PROCESS-CPS-NEXT: store <3 x i32> [[SYSTEM_DATA_FCA_0_0_0_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_0_0_0_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 0, 1, 0 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_1_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, 
i32 0, i32 0, i32 1, i32 0 +; POST-PROCESS-CPS-NEXT: store <2 x float> [[SYSTEM_DATA_FCA_0_0_1_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_0_1_0_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 1, 0 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_1_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 1, i32 0 +; POST-PROCESS-CPS-NEXT: store float [[SYSTEM_DATA_FCA_0_1_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_1_0_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_1_1_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 1, 1 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_1_1_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 1, i32 1 +; POST-PROCESS-CPS-NEXT: store i32 [[SYSTEM_DATA_FCA_0_1_1_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_1_1_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 2 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_2_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 2 +; POST-PROCESS-CPS-NEXT: store <3 x float> [[SYSTEM_DATA_FCA_0_2_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_2_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 3 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_3_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 3 +; POST-PROCESS-CPS-NEXT: store <3 x float> [[SYSTEM_DATA_FCA_0_3_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_3_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_4_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 4 +; POST-PROCESS-CPS-NEXT: 
[[SYSTEM_DATA_FCA_0_4_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 4 +; POST-PROCESS-CPS-NEXT: store float [[SYSTEM_DATA_FCA_0_4_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_4_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_5_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 5 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_5_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 5 +; POST-PROCESS-CPS-NEXT: store i64 [[SYSTEM_DATA_FCA_0_5_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_5_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 1, 0 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_1_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 0 +; POST-PROCESS-CPS-NEXT: store float [[SYSTEM_DATA_FCA_1_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_1_0_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_1_1_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 1, 1 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_1_1_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 1 +; POST-PROCESS-CPS-NEXT: store i32 [[SYSTEM_DATA_FCA_1_1_EXTRACT]], ptr [[SYSTEM_DATA_FCA_1_1_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP0:%.*]] = inttoptr i32 [[PAYLOAD_FCA_0_EXTRACT]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP0]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(21) [[TMP1]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP3:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 4 +; POST-PROCESS-CPS-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP4]], i32 0 +; 
POST-PROCESS-CPS-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(21) [[TMP5]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP7:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 8 +; POST-PROCESS-CPS-NEXT: [[TMP8:%.*]] = inttoptr i32 [[TMP7]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP8]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(21) [[TMP9]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP11:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 12 +; POST-PROCESS-CPS-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP11]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP12]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(21) [[TMP13]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP15:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 16 +; POST-PROCESS-CPS-NEXT: [[TMP16:%.*]] = inttoptr i32 [[TMP15]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP16]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(21) [[TMP17]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP19:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 20 +; POST-PROCESS-CPS-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP19]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP20]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(21) [[TMP21]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP23:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 24 +; POST-PROCESS-CPS-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP23]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP24]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(21) [[TMP25]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP27:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 28 +; POST-PROCESS-CPS-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP27]] to ptr addrspace(21) +; 
POST-PROCESS-CPS-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP28]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP30:%.*]] = load i32, ptr addrspace(21) [[TMP29]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP31:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 32 +; POST-PROCESS-CPS-NEXT: [[TMP32:%.*]] = inttoptr i32 [[TMP31]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP32]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP34:%.*]] = load i32, ptr addrspace(21) [[TMP33]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP35:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 36 +; POST-PROCESS-CPS-NEXT: [[TMP36:%.*]] = inttoptr i32 [[TMP35]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP36]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(21) [[TMP37]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP39:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 40 +; POST-PROCESS-CPS-NEXT: [[TMP40:%.*]] = inttoptr i32 [[TMP39]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP40]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP42:%.*]] = load i32, ptr addrspace(21) [[TMP41]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP43:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 44 +; POST-PROCESS-CPS-NEXT: [[TMP44:%.*]] = inttoptr i32 [[TMP43]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP44]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP46:%.*]] = load i32, ptr addrspace(21) [[TMP45]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP47:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 48 +; POST-PROCESS-CPS-NEXT: [[TMP48:%.*]] = inttoptr i32 [[TMP47]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP48]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP50:%.*]] = load i32, ptr addrspace(21) [[TMP49]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP51:%.*]] = add i32 
[[PAYLOAD_FCA_0_EXTRACT]], 52 +; POST-PROCESS-CPS-NEXT: [[TMP52:%.*]] = inttoptr i32 [[TMP51]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP52]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP54:%.*]] = load i32, ptr addrspace(21) [[TMP53]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP55:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 56 +; POST-PROCESS-CPS-NEXT: [[TMP56:%.*]] = inttoptr i32 [[TMP55]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP56]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP58:%.*]] = load i32, ptr addrspace(21) [[TMP57]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP59:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 60 +; POST-PROCESS-CPS-NEXT: [[TMP60:%.*]] = inttoptr i32 [[TMP59]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP60]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP62:%.*]] = load i32, ptr addrspace(21) [[TMP61]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP63:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 64 +; POST-PROCESS-CPS-NEXT: [[TMP64:%.*]] = inttoptr i32 [[TMP63]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP64]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP66:%.*]] = load i32, ptr addrspace(21) [[TMP65]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP67:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 68 +; POST-PROCESS-CPS-NEXT: [[TMP68:%.*]] = inttoptr i32 [[TMP67]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP69:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP68]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP70:%.*]] = load i32, ptr addrspace(21) [[TMP69]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP71:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 72 +; POST-PROCESS-CPS-NEXT: [[TMP72:%.*]] = inttoptr i32 [[TMP71]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP73:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP72]], i32 0 +; POST-PROCESS-CPS-NEXT: 
[[TMP74:%.*]] = load i32, ptr addrspace(21) [[TMP73]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP75:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 76 +; POST-PROCESS-CPS-NEXT: [[TMP76:%.*]] = inttoptr i32 [[TMP75]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP77:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP76]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP78:%.*]] = load i32, ptr addrspace(21) [[TMP77]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP79:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 80 +; POST-PROCESS-CPS-NEXT: [[TMP80:%.*]] = inttoptr i32 [[TMP79]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP81:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP80]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP82:%.*]] = load i32, ptr addrspace(21) [[TMP81]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP83:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 84 +; POST-PROCESS-CPS-NEXT: [[TMP84:%.*]] = inttoptr i32 [[TMP83]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP85:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP84]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP86:%.*]] = load i32, ptr addrspace(21) [[TMP85]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP87:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 88 +; POST-PROCESS-CPS-NEXT: [[TMP88:%.*]] = inttoptr i32 [[TMP87]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP89:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP88]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP90:%.*]] = load i32, ptr addrspace(21) [[TMP89]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP91:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 92 +; POST-PROCESS-CPS-NEXT: [[TMP92:%.*]] = inttoptr i32 [[TMP91]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP93:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP92]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP94:%.*]] = load i32, ptr addrspace(21) [[TMP93]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP95:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 96 +; POST-PROCESS-CPS-NEXT: [[TMP96:%.*]] = inttoptr i32 [[TMP95]] to ptr addrspace(21) +; 
POST-PROCESS-CPS-NEXT: [[TMP97:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP96]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP98:%.*]] = load i32, ptr addrspace(21) [[TMP97]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP99:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 100 +; POST-PROCESS-CPS-NEXT: [[TMP100:%.*]] = inttoptr i32 [[TMP99]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP101:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP100]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP102:%.*]] = load i32, ptr addrspace(21) [[TMP101]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP103:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 104 +; POST-PROCESS-CPS-NEXT: [[TMP104:%.*]] = inttoptr i32 [[TMP103]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP105:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP104]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP106:%.*]] = load i32, ptr addrspace(21) [[TMP105]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; POST-PROCESS-CPS-NEXT: [[ADDR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP107]], i32 0, i32 1 +; POST-PROCESS-CPS-NEXT: [[VAL_I_FCA_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[ADDR_I]], i32 0, i32 0 +; POST-PROCESS-CPS-NEXT: [[VAL_I_FCA_0_LOAD:%.*]] = load <2 x float>, ptr [[VAL_I_FCA_0_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[VAL_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[VAL_I_FCA_0_LOAD]], 0 +; POST-PROCESS-CPS-NEXT: [[VAL_I_FCA_0_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL_I_FCA_0_INSERT]], 0 +; POST-PROCESS-CPS-NEXT: [[DOTSROA_025_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VAL_I_FCA_0_INSERT_FCA_0_EXTRACT]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP108:%.*]] = bitcast float [[DOTSROA_025_0_VEC_EXTRACT]] to i32 +; POST-PROCESS-CPS-NEXT: 
[[DOTSROA_025_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VAL_I_FCA_0_INSERT_FCA_0_EXTRACT]], i32 1 +; POST-PROCESS-CPS-NEXT: [[TMP109:%.*]] = bitcast float [[DOTSROA_025_4_VEC_EXTRACT]] to i32 +; POST-PROCESS-CPS-NEXT: [[HIT_ATTRS_FCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[HIT_ATTRS]], 0 +; POST-PROCESS-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) +; POST-PROCESS-CPS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) +; POST-PROCESS-CPS-NEXT: [[TMP110:%.*]] = inttoptr i32 [[PAYLOAD_FCA_0_EXTRACT]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP111:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP110]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP2]], ptr addrspace(21) [[TMP111]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP112:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 4 +; POST-PROCESS-CPS-NEXT: [[TMP113:%.*]] = inttoptr i32 [[TMP112]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP114:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP113]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP6]], ptr addrspace(21) [[TMP114]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP115:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 8 +; POST-PROCESS-CPS-NEXT: [[TMP116:%.*]] = inttoptr i32 [[TMP115]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP117:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP116]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP10]], ptr addrspace(21) [[TMP117]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP118:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 12 +; POST-PROCESS-CPS-NEXT: [[TMP119:%.*]] = inttoptr i32 [[TMP118]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP120:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP119]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP14]], ptr addrspace(21) [[TMP120]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP121:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 16 +; POST-PROCESS-CPS-NEXT: [[TMP122:%.*]] = inttoptr i32 [[TMP121]] to ptr 
addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP123:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP122]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP18]], ptr addrspace(21) [[TMP123]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP124:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 20 +; POST-PROCESS-CPS-NEXT: [[TMP125:%.*]] = inttoptr i32 [[TMP124]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP126:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP125]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP22]], ptr addrspace(21) [[TMP126]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP127:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 24 +; POST-PROCESS-CPS-NEXT: [[TMP128:%.*]] = inttoptr i32 [[TMP127]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP129:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP128]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP26]], ptr addrspace(21) [[TMP129]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP130:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 28 +; POST-PROCESS-CPS-NEXT: [[TMP131:%.*]] = inttoptr i32 [[TMP130]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP132:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP131]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP30]], ptr addrspace(21) [[TMP132]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP133:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 32 +; POST-PROCESS-CPS-NEXT: [[TMP134:%.*]] = inttoptr i32 [[TMP133]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP135:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP134]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP34]], ptr addrspace(21) [[TMP135]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP136:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 36 +; POST-PROCESS-CPS-NEXT: [[TMP137:%.*]] = inttoptr i32 [[TMP136]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP138:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP137]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP38]], ptr addrspace(21) [[TMP138]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP139:%.*]] = add 
i32 [[PAYLOAD_FCA_0_EXTRACT]], 40 +; POST-PROCESS-CPS-NEXT: [[TMP140:%.*]] = inttoptr i32 [[TMP139]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP141:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP140]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP42]], ptr addrspace(21) [[TMP141]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP142:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 44 +; POST-PROCESS-CPS-NEXT: [[TMP143:%.*]] = inttoptr i32 [[TMP142]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP144:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP143]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP46]], ptr addrspace(21) [[TMP144]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP145:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 48 +; POST-PROCESS-CPS-NEXT: [[TMP146:%.*]] = inttoptr i32 [[TMP145]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP147:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP146]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP50]], ptr addrspace(21) [[TMP147]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP148:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 52 +; POST-PROCESS-CPS-NEXT: [[TMP149:%.*]] = inttoptr i32 [[TMP148]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP150:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP149]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP54]], ptr addrspace(21) [[TMP150]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP151:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 56 +; POST-PROCESS-CPS-NEXT: [[TMP152:%.*]] = inttoptr i32 [[TMP151]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP153:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP152]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP58]], ptr addrspace(21) [[TMP153]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP154:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 60 +; POST-PROCESS-CPS-NEXT: [[TMP155:%.*]] = inttoptr i32 [[TMP154]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP156:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP155]], i32 0 +; POST-PROCESS-CPS-NEXT: 
store i32 [[TMP62]], ptr addrspace(21) [[TMP156]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP157:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 64 +; POST-PROCESS-CPS-NEXT: [[TMP158:%.*]] = inttoptr i32 [[TMP157]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP159:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP158]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP66]], ptr addrspace(21) [[TMP159]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP160:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 68 +; POST-PROCESS-CPS-NEXT: [[TMP161:%.*]] = inttoptr i32 [[TMP160]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP162:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP161]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP70]], ptr addrspace(21) [[TMP162]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP163:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 72 +; POST-PROCESS-CPS-NEXT: [[TMP164:%.*]] = inttoptr i32 [[TMP163]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP165:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP164]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP74]], ptr addrspace(21) [[TMP165]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP166:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 76 +; POST-PROCESS-CPS-NEXT: [[TMP167:%.*]] = inttoptr i32 [[TMP166]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP168:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP167]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP78]], ptr addrspace(21) [[TMP168]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP169:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 80 +; POST-PROCESS-CPS-NEXT: [[TMP170:%.*]] = inttoptr i32 [[TMP169]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP171:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP170]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP82]], ptr addrspace(21) [[TMP171]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP172:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 84 +; POST-PROCESS-CPS-NEXT: [[TMP173:%.*]] = inttoptr i32 [[TMP172]] to ptr addrspace(21) +; 
POST-PROCESS-CPS-NEXT: [[TMP174:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP173]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP86]], ptr addrspace(21) [[TMP174]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP175:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 88 +; POST-PROCESS-CPS-NEXT: [[TMP176:%.*]] = inttoptr i32 [[TMP175]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP177:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP176]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP90]], ptr addrspace(21) [[TMP177]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP178:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 92 +; POST-PROCESS-CPS-NEXT: [[TMP179:%.*]] = inttoptr i32 [[TMP178]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP180:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP179]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP94]], ptr addrspace(21) [[TMP180]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP181:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 96 +; POST-PROCESS-CPS-NEXT: [[TMP182:%.*]] = inttoptr i32 [[TMP181]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP183:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP182]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP98]], ptr addrspace(21) [[TMP183]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP184:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 100 +; POST-PROCESS-CPS-NEXT: [[TMP185:%.*]] = inttoptr i32 [[TMP184]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP186:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP185]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP102]], ptr addrspace(21) [[TMP186]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP187:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 104 +; POST-PROCESS-CPS-NEXT: [[TMP188:%.*]] = inttoptr i32 [[TMP187]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP189:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP188]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP106]], ptr addrspace(21) [[TMP189]], align 4 +; POST-PROCESS-CPS-NEXT: 
[[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP190:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT]] to i32 +; POST-PROCESS-CPS-NEXT: [[TMP191:%.*]] = bitcast i32 [[TMP190]] to float +; POST-PROCESS-CPS-NEXT: [[DOTSROA_027_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP191]], i32 0 +; POST-PROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 +; POST-PROCESS-CPS-NEXT: [[TMP192:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT]] to i32 +; POST-PROCESS-CPS-NEXT: [[TMP193:%.*]] = bitcast i32 [[TMP192]] to float +; POST-PROCESS-CPS-NEXT: [[DOTSROA_027_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_027_0_VEC_INSERT]], float [[TMP193]], i32 1 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_INSERT26:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_027_4_VEC_INSERT]], 0 +; POST-PROCESS-CPS-NEXT: [[TMP194:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; POST-PROCESS-CPS-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP194]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT26]]) +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_0_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0, i32 0 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_0_0_0_LOAD:%.*]] = load <3 x i32>, ptr [[DOTFCA_0_0_0_0_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[DOTFCA_0_0_0_0_LOAD]], 0, 0, 0, 0 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_0_1_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 1, i32 0 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_0_1_0_LOAD:%.*]] = load <2 
x float>, ptr [[DOTFCA_0_0_1_0_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_0_1_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_0_0_0_INSERT]], <2 x float> [[DOTFCA_0_0_1_0_LOAD]], 0, 0, 1, 0 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_1_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 1, i32 0 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_1_0_LOAD:%.*]] = load float, ptr [[DOTFCA_0_1_0_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_1_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_0_1_0_INSERT]], float [[DOTFCA_0_1_0_LOAD]], 0, 1, 0 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_1_1_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 1, i32 1 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_1_1_LOAD:%.*]] = load i32, ptr [[DOTFCA_0_1_1_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_1_1_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_1_0_INSERT]], i32 [[DOTFCA_0_1_1_LOAD]], 0, 1, 1 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_2_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 2 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_2_LOAD:%.*]] = load <3 x float>, ptr [[DOTFCA_0_2_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_1_1_INSERT]], <3 x float> [[DOTFCA_0_2_LOAD]], 0, 2 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_3_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 3 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_3_LOAD:%.*]] = load <3 x float>, ptr [[DOTFCA_0_3_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_2_INSERT]], <3 x float> [[DOTFCA_0_3_LOAD]], 0, 3 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_4_GEP:%.*]] = getelementptr inbounds 
[[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_4_LOAD:%.*]] = load float, ptr [[DOTFCA_0_4_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_4_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_3_INSERT]], float [[DOTFCA_0_4_LOAD]], 0, 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_5_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 5 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_5_LOAD:%.*]] = load i64, ptr [[DOTFCA_0_5_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_5_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_4_INSERT]], i64 [[DOTFCA_0_5_LOAD]], 0, 5 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_1_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 0 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_1_0_LOAD:%.*]] = load float, ptr [[DOTFCA_1_0_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_1_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_5_INSERT]], float [[DOTFCA_1_0_LOAD]], 1, 0 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_1_1_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 1 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_1_1_LOAD:%.*]] = load i32, ptr [[DOTFCA_1_1_GEP]], align 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_1_1_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_0_INSERT]], i32 [[DOTFCA_1_1_LOAD]], 1, 1 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [30 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_2_INSERT]], i32 
[[PAYLOAD_FCA_3_EXTRACT]], 3 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_6_INSERT]], i32 [[PAYLOAD_FCA_7_EXTRACT]], 7 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_7_INSERT]], i32 [[PAYLOAD_FCA_8_EXTRACT]], 8 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_8_INSERT]], i32 [[PAYLOAD_FCA_9_EXTRACT]], 9 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_10_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_9_INSERT]], i32 [[PAYLOAD_FCA_10_EXTRACT]], 10 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_11_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_10_INSERT]], i32 [[PAYLOAD_FCA_11_EXTRACT]], 11 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_12_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_11_INSERT]], i32 [[PAYLOAD_FCA_12_EXTRACT]], 12 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_13_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_12_INSERT]], i32 [[PAYLOAD_FCA_13_EXTRACT]], 13 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_14_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_13_INSERT]], i32 [[PAYLOAD_FCA_14_EXTRACT]], 14 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_15_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_14_INSERT]], i32 [[PAYLOAD_FCA_15_EXTRACT]], 15 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_16_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_15_INSERT]], i32 [[PAYLOAD_FCA_16_EXTRACT]], 16 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_17_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_16_INSERT]], i32 [[PAYLOAD_FCA_17_EXTRACT]], 17 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_18_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_17_INSERT]], i32 [[PAYLOAD_FCA_18_EXTRACT]], 18 
+; POST-PROCESS-CPS-NEXT: [[DOTFCA_19_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_18_INSERT]], i32 [[PAYLOAD_FCA_19_EXTRACT]], 19 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_20_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_19_INSERT]], i32 [[PAYLOAD_FCA_20_EXTRACT]], 20 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_21_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_20_INSERT]], i32 [[PAYLOAD_FCA_21_EXTRACT]], 21 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_22_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_21_INSERT]], i32 [[PAYLOAD_FCA_22_EXTRACT]], 22 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_23_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_22_INSERT]], i32 [[PAYLOAD_FCA_23_EXTRACT]], 23 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_24_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_23_INSERT]], i32 [[PAYLOAD_FCA_24_EXTRACT]], 24 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_25_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_24_INSERT]], i32 [[PAYLOAD_FCA_25_EXTRACT]], 25 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_26_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_25_INSERT]], i32 [[PAYLOAD_FCA_26_EXTRACT]], 26 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_27_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT]], i32 [[PAYLOAD_FCA_27_EXTRACT]], 27 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 +; POST-PROCESS-CPS-NEXT: [[TMP195:%.*]] = zext i32 [[RETURNADDR]] to i64 +; POST-PROCESS-CPS-NEXT: [[TMP196:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP195]], i32 [[TMP196]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POST-PROCESS-CPS-NEXT: unreachable +; +; +; POST-PROCESS-CPS-LABEL: define void @ClosestHit( +; POST-PROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [21 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META25:![0-9]+]] !lgc.cps [[META22]] !continuation [[META26:![0-9]+]] !continuation.stacksize [[META21]] { +; POST-PROCESS-CPS-NEXT: AllocaSpillBB: +; POST-PROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; POST-PROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP0:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 116 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 108 +; POST-PROCESS-CPS-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP3]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[RETURNADDR]], ptr addrspace(21) [[TMP4]], align 4 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 0 +; POST-PROCESS-CPS-NEXT: [[TMP5:%.*]] = add i32 [[TMP0]], 112 +; POST-PROCESS-CPS-NEXT: [[TMP6:%.*]] = inttoptr i32 [[TMP5]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP6]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[PAYLOAD_FCA_0_EXTRACT]], ptr addrspace(21) [[TMP7]], align 4 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 1 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 2 +; 
POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 3 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 4 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 5 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 6 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 7 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 8 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 9 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_10_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 10 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_11_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 11 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_12_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 12 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_13_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 13 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_14_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 14 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_15_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 15 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_16_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 16 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_17_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 17 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_18_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 18 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_19_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 19 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_20_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 20 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_21_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 21 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_22_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 22 +; POST-PROCESS-CPS-NEXT: 
[[PAYLOAD_FCA_23_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 23 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_24_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 24 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_25_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 25 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_26_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 26 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 27 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 28 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 29 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], 0, 0 +; POST-PROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], 1, 0 +; POST-PROCESS-CPS-NEXT: [[TMP8:%.*]] = inttoptr i32 [[PAYLOAD_FCA_0_EXTRACT]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP8]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(21) [[TMP9]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP11:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 4 +; POST-PROCESS-CPS-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP11]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP12]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(21) [[TMP13]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP15:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 8 +; POST-PROCESS-CPS-NEXT: [[TMP16:%.*]] = inttoptr i32 [[TMP15]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP16]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(21) [[TMP17]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP19:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 12 +; POST-PROCESS-CPS-NEXT: 
[[TMP20:%.*]] = inttoptr i32 [[TMP19]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP20]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(21) [[TMP21]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP23:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 16 +; POST-PROCESS-CPS-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP23]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP24]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(21) [[TMP25]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP27:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 20 +; POST-PROCESS-CPS-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP27]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP28]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP30:%.*]] = load i32, ptr addrspace(21) [[TMP29]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP31:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 24 +; POST-PROCESS-CPS-NEXT: [[TMP32:%.*]] = inttoptr i32 [[TMP31]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP32]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP34:%.*]] = load i32, ptr addrspace(21) [[TMP33]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP35:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 28 +; POST-PROCESS-CPS-NEXT: [[TMP36:%.*]] = inttoptr i32 [[TMP35]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP36]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(21) [[TMP37]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP39:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 32 +; POST-PROCESS-CPS-NEXT: [[TMP40:%.*]] = inttoptr i32 [[TMP39]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP40]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP42:%.*]] = load i32, ptr addrspace(21) [[TMP41]], align 4 +; 
POST-PROCESS-CPS-NEXT: [[TMP43:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 36 +; POST-PROCESS-CPS-NEXT: [[TMP44:%.*]] = inttoptr i32 [[TMP43]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP44]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP46:%.*]] = load i32, ptr addrspace(21) [[TMP45]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP47:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 40 +; POST-PROCESS-CPS-NEXT: [[TMP48:%.*]] = inttoptr i32 [[TMP47]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP48]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP50:%.*]] = load i32, ptr addrspace(21) [[TMP49]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP51:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 44 +; POST-PROCESS-CPS-NEXT: [[TMP52:%.*]] = inttoptr i32 [[TMP51]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP52]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP54:%.*]] = load i32, ptr addrspace(21) [[TMP53]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP55:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 48 +; POST-PROCESS-CPS-NEXT: [[TMP56:%.*]] = inttoptr i32 [[TMP55]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP56]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP58:%.*]] = load i32, ptr addrspace(21) [[TMP57]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP59:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 52 +; POST-PROCESS-CPS-NEXT: [[TMP60:%.*]] = inttoptr i32 [[TMP59]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP60]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP62:%.*]] = load i32, ptr addrspace(21) [[TMP61]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP63:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 56 +; POST-PROCESS-CPS-NEXT: [[TMP64:%.*]] = inttoptr i32 [[TMP63]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr addrspace(21) 
[[TMP64]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP66:%.*]] = load i32, ptr addrspace(21) [[TMP65]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP67:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 60 +; POST-PROCESS-CPS-NEXT: [[TMP68:%.*]] = inttoptr i32 [[TMP67]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP69:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP68]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP70:%.*]] = load i32, ptr addrspace(21) [[TMP69]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP71:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 64 +; POST-PROCESS-CPS-NEXT: [[TMP72:%.*]] = inttoptr i32 [[TMP71]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP73:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP72]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP74:%.*]] = load i32, ptr addrspace(21) [[TMP73]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP75:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 68 +; POST-PROCESS-CPS-NEXT: [[TMP76:%.*]] = inttoptr i32 [[TMP75]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP77:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP76]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP78:%.*]] = load i32, ptr addrspace(21) [[TMP77]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP79:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 72 +; POST-PROCESS-CPS-NEXT: [[TMP80:%.*]] = inttoptr i32 [[TMP79]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP81:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP80]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP82:%.*]] = load i32, ptr addrspace(21) [[TMP81]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP83:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 76 +; POST-PROCESS-CPS-NEXT: [[TMP84:%.*]] = inttoptr i32 [[TMP83]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP85:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP84]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP86:%.*]] = load i32, ptr addrspace(21) [[TMP85]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP87:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 80 +; POST-PROCESS-CPS-NEXT: [[TMP88:%.*]] = inttoptr i32 
[[TMP87]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP89:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP88]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP90:%.*]] = load i32, ptr addrspace(21) [[TMP89]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP91:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 84 +; POST-PROCESS-CPS-NEXT: [[TMP92:%.*]] = inttoptr i32 [[TMP91]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP93:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP92]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP94:%.*]] = load i32, ptr addrspace(21) [[TMP93]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP95:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 88 +; POST-PROCESS-CPS-NEXT: [[TMP96:%.*]] = inttoptr i32 [[TMP95]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP97:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP96]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP98:%.*]] = load i32, ptr addrspace(21) [[TMP97]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP99:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 92 +; POST-PROCESS-CPS-NEXT: [[TMP100:%.*]] = inttoptr i32 [[TMP99]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP101:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP100]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP102:%.*]] = load i32, ptr addrspace(21) [[TMP101]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP103:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 96 +; POST-PROCESS-CPS-NEXT: [[TMP104:%.*]] = inttoptr i32 [[TMP103]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP105:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP104]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP106:%.*]] = load i32, ptr addrspace(21) [[TMP105]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP107:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 100 +; POST-PROCESS-CPS-NEXT: [[TMP108:%.*]] = inttoptr i32 [[TMP107]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP109:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP108]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP110:%.*]] = load i32, ptr addrspace(21) [[TMP109]], align 4 +; 
POST-PROCESS-CPS-NEXT: [[TMP111:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 104 +; POST-PROCESS-CPS-NEXT: [[TMP112:%.*]] = inttoptr i32 [[TMP111]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP113:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP112]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP114:%.*]] = load i32, ptr addrspace(21) [[TMP113]], align 4 +; POST-PROCESS-CPS-NEXT: [[VAL_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] poison, <2 x float> [[SYSTEM_DATA_FCA_1_0_EXTRACT]], 0 +; POST-PROCESS-CPS-NEXT: [[VAL_I_FCA_0_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL_I_FCA_0_INSERT]], 0 +; POST-PROCESS-CPS-NEXT: [[DOTSROA_0257_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VAL_I_FCA_0_INSERT_FCA_0_EXTRACT]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP115:%.*]] = bitcast float [[DOTSROA_0257_0_VEC_EXTRACT]] to i32 +; POST-PROCESS-CPS-NEXT: [[DOTSROA_0257_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VAL_I_FCA_0_INSERT_FCA_0_EXTRACT]], i32 1 +; POST-PROCESS-CPS-NEXT: [[TMP116:%.*]] = bitcast float [[DOTSROA_0257_4_VEC_EXTRACT]] to i32 +; POST-PROCESS-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) +; POST-PROCESS-CPS-NEXT: [[TMP117:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 +; POST-PROCESS-CPS-NEXT: [[TMP118:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 +; POST-PROCESS-CPS-NEXT: [[TMP119:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP117]]) +; POST-PROCESS-CPS-NEXT: [[TMP120:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP119]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; POST-PROCESS-CPS-NEXT: [[TMP121:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP120]]) +; POST-PROCESS-CPS-NEXT: 
[[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[SYSTEM_DATA_FCA_0_0_EXTRACT]], 0 +; POST-PROCESS-CPS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 +; POST-PROCESS-CPS-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 +; POST-PROCESS-CPS-NEXT: [[TMP122:%.*]] = call i64 @continuation.getAddrAndMD(ptr @ClosestHit.resume.0) +; POST-PROCESS-CPS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP122]], 5 +; POST-PROCESS-CPS-NEXT: [[TMP123:%.*]] = inttoptr i32 [[TMP0]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP124:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP123]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP10]], ptr addrspace(21) [[TMP124]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP125:%.*]] = add i32 [[TMP0]], 4 +; POST-PROCESS-CPS-NEXT: [[TMP126:%.*]] = inttoptr i32 [[TMP125]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP127:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP126]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP14]], ptr addrspace(21) [[TMP127]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP128:%.*]] = add i32 [[TMP0]], 8 +; POST-PROCESS-CPS-NEXT: [[TMP129:%.*]] = inttoptr i32 [[TMP128]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP130:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP129]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP18]], ptr addrspace(21) [[TMP130]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP131:%.*]] = add i32 [[TMP0]], 12 +; POST-PROCESS-CPS-NEXT: [[TMP132:%.*]] = inttoptr i32 [[TMP131]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP133:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP132]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP22]], ptr addrspace(21) [[TMP133]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP134:%.*]] = add i32 [[TMP0]], 16 +; 
POST-PROCESS-CPS-NEXT: [[TMP135:%.*]] = inttoptr i32 [[TMP134]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP136:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP135]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP26]], ptr addrspace(21) [[TMP136]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP137:%.*]] = add i32 [[TMP0]], 20 +; POST-PROCESS-CPS-NEXT: [[TMP138:%.*]] = inttoptr i32 [[TMP137]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP139:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP138]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP30]], ptr addrspace(21) [[TMP139]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP140:%.*]] = add i32 [[TMP0]], 24 +; POST-PROCESS-CPS-NEXT: [[TMP141:%.*]] = inttoptr i32 [[TMP140]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP142:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP141]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP34]], ptr addrspace(21) [[TMP142]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP143:%.*]] = add i32 [[TMP0]], 28 +; POST-PROCESS-CPS-NEXT: [[TMP144:%.*]] = inttoptr i32 [[TMP143]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP145:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP144]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP38]], ptr addrspace(21) [[TMP145]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP146:%.*]] = add i32 [[TMP0]], 32 +; POST-PROCESS-CPS-NEXT: [[TMP147:%.*]] = inttoptr i32 [[TMP146]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP148:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP147]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP42]], ptr addrspace(21) [[TMP148]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP149:%.*]] = add i32 [[TMP0]], 36 +; POST-PROCESS-CPS-NEXT: [[TMP150:%.*]] = inttoptr i32 [[TMP149]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP151:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP150]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP46]], ptr addrspace(21) [[TMP151]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP152:%.*]] = add i32 [[TMP0]], 
40 +; POST-PROCESS-CPS-NEXT: [[TMP153:%.*]] = inttoptr i32 [[TMP152]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP154:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP153]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP50]], ptr addrspace(21) [[TMP154]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP155:%.*]] = add i32 [[TMP0]], 44 +; POST-PROCESS-CPS-NEXT: [[TMP156:%.*]] = inttoptr i32 [[TMP155]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP157:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP156]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP54]], ptr addrspace(21) [[TMP157]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP158:%.*]] = add i32 [[TMP0]], 48 +; POST-PROCESS-CPS-NEXT: [[TMP159:%.*]] = inttoptr i32 [[TMP158]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP160:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP159]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP58]], ptr addrspace(21) [[TMP160]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP161:%.*]] = add i32 [[TMP0]], 52 +; POST-PROCESS-CPS-NEXT: [[TMP162:%.*]] = inttoptr i32 [[TMP161]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP163:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP162]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP62]], ptr addrspace(21) [[TMP163]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP164:%.*]] = add i32 [[TMP0]], 56 +; POST-PROCESS-CPS-NEXT: [[TMP165:%.*]] = inttoptr i32 [[TMP164]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP166:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP165]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP66]], ptr addrspace(21) [[TMP166]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP167:%.*]] = add i32 [[TMP0]], 60 +; POST-PROCESS-CPS-NEXT: [[TMP168:%.*]] = inttoptr i32 [[TMP167]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP169:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP168]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP70]], ptr addrspace(21) [[TMP169]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP170:%.*]] = add i32 
[[TMP0]], 64 +; POST-PROCESS-CPS-NEXT: [[TMP171:%.*]] = inttoptr i32 [[TMP170]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP172:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP171]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP74]], ptr addrspace(21) [[TMP172]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP173:%.*]] = add i32 [[TMP0]], 68 +; POST-PROCESS-CPS-NEXT: [[TMP174:%.*]] = inttoptr i32 [[TMP173]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP175:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP174]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP78]], ptr addrspace(21) [[TMP175]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP176:%.*]] = add i32 [[TMP0]], 72 +; POST-PROCESS-CPS-NEXT: [[TMP177:%.*]] = inttoptr i32 [[TMP176]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP178:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP177]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP82]], ptr addrspace(21) [[TMP178]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP179:%.*]] = add i32 [[TMP0]], 76 +; POST-PROCESS-CPS-NEXT: [[TMP180:%.*]] = inttoptr i32 [[TMP179]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP181:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP180]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP86]], ptr addrspace(21) [[TMP181]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP182:%.*]] = add i32 [[TMP0]], 80 +; POST-PROCESS-CPS-NEXT: [[TMP183:%.*]] = inttoptr i32 [[TMP182]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP184:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP183]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP90]], ptr addrspace(21) [[TMP184]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP185:%.*]] = add i32 [[TMP0]], 84 +; POST-PROCESS-CPS-NEXT: [[TMP186:%.*]] = inttoptr i32 [[TMP185]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP187:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP186]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP94]], ptr addrspace(21) [[TMP187]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP188:%.*]] = add 
i32 [[TMP0]], 88 +; POST-PROCESS-CPS-NEXT: [[TMP189:%.*]] = inttoptr i32 [[TMP188]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP190:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP189]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP98]], ptr addrspace(21) [[TMP190]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP191:%.*]] = add i32 [[TMP0]], 92 +; POST-PROCESS-CPS-NEXT: [[TMP192:%.*]] = inttoptr i32 [[TMP191]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP193:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP192]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP102]], ptr addrspace(21) [[TMP193]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP194:%.*]] = add i32 [[TMP0]], 96 +; POST-PROCESS-CPS-NEXT: [[TMP195:%.*]] = inttoptr i32 [[TMP194]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP196:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP195]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP106]], ptr addrspace(21) [[TMP196]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP197:%.*]] = add i32 [[TMP0]], 100 +; POST-PROCESS-CPS-NEXT: [[TMP198:%.*]] = inttoptr i32 [[TMP197]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP199:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP198]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP110]], ptr addrspace(21) [[TMP199]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP200:%.*]] = add i32 [[TMP0]], 104 +; POST-PROCESS-CPS-NEXT: [[TMP201:%.*]] = inttoptr i32 [[TMP200]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP202:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP201]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP114]], ptr addrspace(21) [[TMP202]], align 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_INSERT54:%.*]] = insertvalue [30 x i32] poison, i32 [[TMP0]], 0 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_1_INSERT57:%.*]] = insertvalue [30 x i32] [[DOTFCA_0_INSERT54]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_2_INSERT60:%.*]] = insertvalue [30 x i32] [[DOTFCA_1_INSERT57]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 +; 
POST-PROCESS-CPS-NEXT: [[DOTFCA_3_INSERT63:%.*]] = insertvalue [30 x i32] [[DOTFCA_2_INSERT60]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_4_INSERT66:%.*]] = insertvalue [30 x i32] [[DOTFCA_3_INSERT63]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_5_INSERT69:%.*]] = insertvalue [30 x i32] [[DOTFCA_4_INSERT66]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_6_INSERT72:%.*]] = insertvalue [30 x i32] [[DOTFCA_5_INSERT69]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_7_INSERT75:%.*]] = insertvalue [30 x i32] [[DOTFCA_6_INSERT72]], i32 [[PAYLOAD_FCA_7_EXTRACT]], 7 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_8_INSERT78:%.*]] = insertvalue [30 x i32] [[DOTFCA_7_INSERT75]], i32 [[PAYLOAD_FCA_8_EXTRACT]], 8 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_9_INSERT81:%.*]] = insertvalue [30 x i32] [[DOTFCA_8_INSERT78]], i32 [[PAYLOAD_FCA_9_EXTRACT]], 9 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_10_INSERT84:%.*]] = insertvalue [30 x i32] [[DOTFCA_9_INSERT81]], i32 [[PAYLOAD_FCA_10_EXTRACT]], 10 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_11_INSERT87:%.*]] = insertvalue [30 x i32] [[DOTFCA_10_INSERT84]], i32 [[PAYLOAD_FCA_11_EXTRACT]], 11 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_12_INSERT90:%.*]] = insertvalue [30 x i32] [[DOTFCA_11_INSERT87]], i32 [[PAYLOAD_FCA_12_EXTRACT]], 12 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_13_INSERT93:%.*]] = insertvalue [30 x i32] [[DOTFCA_12_INSERT90]], i32 [[PAYLOAD_FCA_13_EXTRACT]], 13 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_14_INSERT96:%.*]] = insertvalue [30 x i32] [[DOTFCA_13_INSERT93]], i32 [[PAYLOAD_FCA_14_EXTRACT]], 14 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_15_INSERT99:%.*]] = insertvalue [30 x i32] [[DOTFCA_14_INSERT96]], i32 [[PAYLOAD_FCA_15_EXTRACT]], 15 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_16_INSERT102:%.*]] = insertvalue [30 x i32] [[DOTFCA_15_INSERT99]], i32 [[PAYLOAD_FCA_16_EXTRACT]], 16 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_17_INSERT105:%.*]] = insertvalue [30 x i32] [[DOTFCA_16_INSERT102]], i32 
[[PAYLOAD_FCA_17_EXTRACT]], 17 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_18_INSERT108:%.*]] = insertvalue [30 x i32] [[DOTFCA_17_INSERT105]], i32 [[PAYLOAD_FCA_18_EXTRACT]], 18 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_19_INSERT111:%.*]] = insertvalue [30 x i32] [[DOTFCA_18_INSERT108]], i32 [[PAYLOAD_FCA_19_EXTRACT]], 19 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_20_INSERT114:%.*]] = insertvalue [30 x i32] [[DOTFCA_19_INSERT111]], i32 [[PAYLOAD_FCA_20_EXTRACT]], 20 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_21_INSERT117:%.*]] = insertvalue [30 x i32] [[DOTFCA_20_INSERT114]], i32 [[PAYLOAD_FCA_21_EXTRACT]], 21 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_22_INSERT120:%.*]] = insertvalue [30 x i32] [[DOTFCA_21_INSERT117]], i32 [[PAYLOAD_FCA_22_EXTRACT]], 22 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_23_INSERT123:%.*]] = insertvalue [30 x i32] [[DOTFCA_22_INSERT120]], i32 [[PAYLOAD_FCA_23_EXTRACT]], 23 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_24_INSERT126:%.*]] = insertvalue [30 x i32] [[DOTFCA_23_INSERT123]], i32 [[PAYLOAD_FCA_24_EXTRACT]], 24 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_25_INSERT129:%.*]] = insertvalue [30 x i32] [[DOTFCA_24_INSERT126]], i32 [[PAYLOAD_FCA_25_EXTRACT]], 25 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_26_INSERT132:%.*]] = insertvalue [30 x i32] [[DOTFCA_25_INSERT129]], i32 [[PAYLOAD_FCA_26_EXTRACT]], 26 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_27_INSERT135:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT132]], i32 [[PAYLOAD_FCA_27_EXTRACT]], 27 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_28_INSERT138:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT135]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_29_INSERT141:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT138]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 +; POST-PROCESS-CPS-NEXT: [[TMP203:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.waitContinue(i64 4, i64 -1, i32 [[TMP203]], i64 [[TMP122]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT141]]) +; POST-PROCESS-CPS-NEXT: unreachable +; +; +; POST-PROCESS-CPS-LABEL: define dso_local void @ClosestHit.resume.0( +; POST-PROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [23 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META25]] !lgc.cps [[META22]] !continuation [[META26]] { +; POST-PROCESS-CPS-NEXT: entryresume.0: +; POST-PROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; POST-PROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], -116 +; POST-PROCESS-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP3]], 2 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 0 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 1 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 2 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 3 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_5_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 5 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_6_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 6 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_7_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 7 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_8_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 8 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_9_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 9 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_10_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 10 +; POST-PROCESS-CPS-NEXT: 
[[DOTFCA_11_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 11 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_12_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 12 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_13_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 13 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_14_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 14 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_15_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 15 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_16_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 16 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_17_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 17 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_18_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 18 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_19_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 19 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_20_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 20 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_21_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 21 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_22_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 22 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_23_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 23 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_24_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 24 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_25_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 25 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_26_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 26 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 27 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 28 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP6]], 29 +; POST-PROCESS-CPS-NEXT: [[TMP7:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP7]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(21) [[TMP8]], align 4 +; 
POST-PROCESS-CPS-NEXT: [[TMP10:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 4 +; POST-PROCESS-CPS-NEXT: [[TMP11:%.*]] = inttoptr i32 [[TMP10]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP11]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(21) [[TMP12]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP14:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 8 +; POST-PROCESS-CPS-NEXT: [[TMP15:%.*]] = inttoptr i32 [[TMP14]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP15]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(21) [[TMP16]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP18:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 12 +; POST-PROCESS-CPS-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP18]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP19]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(21) [[TMP20]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP22:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 16 +; POST-PROCESS-CPS-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP22]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP23]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(21) [[TMP24]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP26:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 20 +; POST-PROCESS-CPS-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP26]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP27]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(21) [[TMP28]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP30:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 24 +; POST-PROCESS-CPS-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP30]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP31]], i32 0 +; 
POST-PROCESS-CPS-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(21) [[TMP32]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP34:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 28 +; POST-PROCESS-CPS-NEXT: [[TMP35:%.*]] = inttoptr i32 [[TMP34]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP35]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP37:%.*]] = load i32, ptr addrspace(21) [[TMP36]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP38:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 32 +; POST-PROCESS-CPS-NEXT: [[TMP39:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP39]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP41:%.*]] = load i32, ptr addrspace(21) [[TMP40]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP42:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 36 +; POST-PROCESS-CPS-NEXT: [[TMP43:%.*]] = inttoptr i32 [[TMP42]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP43]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(21) [[TMP44]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP46:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 40 +; POST-PROCESS-CPS-NEXT: [[TMP47:%.*]] = inttoptr i32 [[TMP46]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP47]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(21) [[TMP48]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP50:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 44 +; POST-PROCESS-CPS-NEXT: [[TMP51:%.*]] = inttoptr i32 [[TMP50]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP51]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP53:%.*]] = load i32, ptr addrspace(21) [[TMP52]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP54:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 48 +; POST-PROCESS-CPS-NEXT: [[TMP55:%.*]] = inttoptr i32 [[TMP54]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: 
[[TMP56:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP55]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP57:%.*]] = load i32, ptr addrspace(21) [[TMP56]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP58:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 52 +; POST-PROCESS-CPS-NEXT: [[TMP59:%.*]] = inttoptr i32 [[TMP58]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP59]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP61:%.*]] = load i32, ptr addrspace(21) [[TMP60]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP62:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 56 +; POST-PROCESS-CPS-NEXT: [[TMP63:%.*]] = inttoptr i32 [[TMP62]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP63]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP65:%.*]] = load i32, ptr addrspace(21) [[TMP64]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP66:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 60 +; POST-PROCESS-CPS-NEXT: [[TMP67:%.*]] = inttoptr i32 [[TMP66]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP67]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP69:%.*]] = load i32, ptr addrspace(21) [[TMP68]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP70:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 64 +; POST-PROCESS-CPS-NEXT: [[TMP71:%.*]] = inttoptr i32 [[TMP70]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP72:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP71]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP73:%.*]] = load i32, ptr addrspace(21) [[TMP72]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP74:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 68 +; POST-PROCESS-CPS-NEXT: [[TMP75:%.*]] = inttoptr i32 [[TMP74]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP76:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP75]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP77:%.*]] = load i32, ptr addrspace(21) [[TMP76]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP78:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 72 +; POST-PROCESS-CPS-NEXT: [[TMP79:%.*]] = 
inttoptr i32 [[TMP78]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP80:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP79]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP81:%.*]] = load i32, ptr addrspace(21) [[TMP80]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP82:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 76 +; POST-PROCESS-CPS-NEXT: [[TMP83:%.*]] = inttoptr i32 [[TMP82]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP84:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP83]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP85:%.*]] = load i32, ptr addrspace(21) [[TMP84]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP86:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 80 +; POST-PROCESS-CPS-NEXT: [[TMP87:%.*]] = inttoptr i32 [[TMP86]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP88:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP87]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP89:%.*]] = load i32, ptr addrspace(21) [[TMP88]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP90:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 84 +; POST-PROCESS-CPS-NEXT: [[TMP91:%.*]] = inttoptr i32 [[TMP90]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP92:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP91]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP93:%.*]] = load i32, ptr addrspace(21) [[TMP92]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP94:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 88 +; POST-PROCESS-CPS-NEXT: [[TMP95:%.*]] = inttoptr i32 [[TMP94]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP96:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP95]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP97:%.*]] = load i32, ptr addrspace(21) [[TMP96]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP98:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 92 +; POST-PROCESS-CPS-NEXT: [[TMP99:%.*]] = inttoptr i32 [[TMP98]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP100:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP99]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP101:%.*]] = load i32, ptr addrspace(21) [[TMP100]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP102:%.*]] = 
add i32 [[DOTFCA_0_EXTRACT]], 96 +; POST-PROCESS-CPS-NEXT: [[TMP103:%.*]] = inttoptr i32 [[TMP102]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP104:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP103]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP105:%.*]] = load i32, ptr addrspace(21) [[TMP104]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP106:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 100 +; POST-PROCESS-CPS-NEXT: [[TMP107:%.*]] = inttoptr i32 [[TMP106]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP108:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP107]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP109:%.*]] = load i32, ptr addrspace(21) [[TMP108]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP110:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 104 +; POST-PROCESS-CPS-NEXT: [[TMP111:%.*]] = inttoptr i32 [[TMP110]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP112:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP111]], i32 0 +; POST-PROCESS-CPS-NEXT: [[TMP113:%.*]] = load i32, ptr addrspace(21) [[TMP112]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP114:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP3]], 0 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT254:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP114]], 0 +; POST-PROCESS-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) +; POST-PROCESS-CPS-NEXT: [[TMP115:%.*]] = add i32 [[TMP5]], 112 +; POST-PROCESS-CPS-NEXT: [[TMP116:%.*]] = inttoptr i32 [[TMP115]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP117:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP116]], i32 0 +; POST-PROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT_RELOAD:%.*]] = load i32, ptr addrspace(21) [[TMP117]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP118:%.*]] = add i32 [[TMP5]], 108 +; POST-PROCESS-CPS-NEXT: [[TMP119:%.*]] = inttoptr i32 [[TMP118]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP120:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP119]], i32 0 +; POST-PROCESS-CPS-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i32, 
ptr addrspace(21) [[TMP120]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP121:%.*]] = inttoptr i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP122:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP121]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP9]], ptr addrspace(21) [[TMP122]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP123:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 4 +; POST-PROCESS-CPS-NEXT: [[TMP124:%.*]] = inttoptr i32 [[TMP123]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP125:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP124]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP13]], ptr addrspace(21) [[TMP125]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP126:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 8 +; POST-PROCESS-CPS-NEXT: [[TMP127:%.*]] = inttoptr i32 [[TMP126]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP128:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP127]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP17]], ptr addrspace(21) [[TMP128]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP129:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 12 +; POST-PROCESS-CPS-NEXT: [[TMP130:%.*]] = inttoptr i32 [[TMP129]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP131:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP130]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP21]], ptr addrspace(21) [[TMP131]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP132:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 16 +; POST-PROCESS-CPS-NEXT: [[TMP133:%.*]] = inttoptr i32 [[TMP132]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP134:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP133]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP25]], ptr addrspace(21) [[TMP134]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP135:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 20 +; POST-PROCESS-CPS-NEXT: [[TMP136:%.*]] = inttoptr i32 [[TMP135]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP137:%.*]] = getelementptr i8, ptr 
addrspace(21) [[TMP136]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP29]], ptr addrspace(21) [[TMP137]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP138:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 24 +; POST-PROCESS-CPS-NEXT: [[TMP139:%.*]] = inttoptr i32 [[TMP138]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP140:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP139]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP33]], ptr addrspace(21) [[TMP140]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP141:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 28 +; POST-PROCESS-CPS-NEXT: [[TMP142:%.*]] = inttoptr i32 [[TMP141]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP143:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP142]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP37]], ptr addrspace(21) [[TMP143]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP144:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 32 +; POST-PROCESS-CPS-NEXT: [[TMP145:%.*]] = inttoptr i32 [[TMP144]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP146:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP145]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP41]], ptr addrspace(21) [[TMP146]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP147:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 36 +; POST-PROCESS-CPS-NEXT: [[TMP148:%.*]] = inttoptr i32 [[TMP147]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP149:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP148]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP45]], ptr addrspace(21) [[TMP149]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP150:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 40 +; POST-PROCESS-CPS-NEXT: [[TMP151:%.*]] = inttoptr i32 [[TMP150]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP152:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP151]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP49]], ptr addrspace(21) [[TMP152]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP153:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 44 +; 
POST-PROCESS-CPS-NEXT: [[TMP154:%.*]] = inttoptr i32 [[TMP153]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP155:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP154]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP53]], ptr addrspace(21) [[TMP155]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP156:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 48 +; POST-PROCESS-CPS-NEXT: [[TMP157:%.*]] = inttoptr i32 [[TMP156]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP158:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP157]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP57]], ptr addrspace(21) [[TMP158]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP159:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 52 +; POST-PROCESS-CPS-NEXT: [[TMP160:%.*]] = inttoptr i32 [[TMP159]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP161:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP160]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP61]], ptr addrspace(21) [[TMP161]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP162:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 56 +; POST-PROCESS-CPS-NEXT: [[TMP163:%.*]] = inttoptr i32 [[TMP162]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP164:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP163]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP65]], ptr addrspace(21) [[TMP164]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP165:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 60 +; POST-PROCESS-CPS-NEXT: [[TMP166:%.*]] = inttoptr i32 [[TMP165]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP167:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP166]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP69]], ptr addrspace(21) [[TMP167]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP168:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 64 +; POST-PROCESS-CPS-NEXT: [[TMP169:%.*]] = inttoptr i32 [[TMP168]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP170:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP169]], i32 0 +; POST-PROCESS-CPS-NEXT: 
store i32 [[TMP73]], ptr addrspace(21) [[TMP170]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP171:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 68 +; POST-PROCESS-CPS-NEXT: [[TMP172:%.*]] = inttoptr i32 [[TMP171]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP173:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP172]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP77]], ptr addrspace(21) [[TMP173]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP174:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 72 +; POST-PROCESS-CPS-NEXT: [[TMP175:%.*]] = inttoptr i32 [[TMP174]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP176:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP175]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP81]], ptr addrspace(21) [[TMP176]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP177:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 76 +; POST-PROCESS-CPS-NEXT: [[TMP178:%.*]] = inttoptr i32 [[TMP177]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP179:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP178]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP85]], ptr addrspace(21) [[TMP179]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP180:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 80 +; POST-PROCESS-CPS-NEXT: [[TMP181:%.*]] = inttoptr i32 [[TMP180]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP182:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP181]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP89]], ptr addrspace(21) [[TMP182]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP183:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 84 +; POST-PROCESS-CPS-NEXT: [[TMP184:%.*]] = inttoptr i32 [[TMP183]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP185:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP184]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP93]], ptr addrspace(21) [[TMP185]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP186:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 88 +; POST-PROCESS-CPS-NEXT: [[TMP187:%.*]] = inttoptr i32 
[[TMP186]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP188:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP187]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP97]], ptr addrspace(21) [[TMP188]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP189:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 92 +; POST-PROCESS-CPS-NEXT: [[TMP190:%.*]] = inttoptr i32 [[TMP189]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP191:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP190]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP101]], ptr addrspace(21) [[TMP191]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP192:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 96 +; POST-PROCESS-CPS-NEXT: [[TMP193:%.*]] = inttoptr i32 [[TMP192]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP194:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP193]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP105]], ptr addrspace(21) [[TMP194]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP195:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 100 +; POST-PROCESS-CPS-NEXT: [[TMP196:%.*]] = inttoptr i32 [[TMP195]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP197:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP196]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP109]], ptr addrspace(21) [[TMP197]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP198:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 104 +; POST-PROCESS-CPS-NEXT: [[TMP199:%.*]] = inttoptr i32 [[TMP198]] to ptr addrspace(21) +; POST-PROCESS-CPS-NEXT: [[TMP200:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP199]], i32 0 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP113]], ptr addrspace(21) [[TMP200]], align 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_INSERT253:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT254]], 0 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [30 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 0 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [30 x i32] 
[[DOTFCA_0_INSERT]], i32 [[DOTFCA_1_EXTRACT]], 1 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_1_INSERT]], i32 [[DOTFCA_2_EXTRACT]], 2 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_2_INSERT]], i32 [[DOTFCA_3_EXTRACT]], 3 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_3_INSERT]], i32 [[DOTFCA_4_EXTRACT]], 4 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_4_INSERT]], i32 [[DOTFCA_5_EXTRACT]], 5 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_5_INSERT]], i32 [[DOTFCA_6_EXTRACT]], 6 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_6_INSERT]], i32 [[DOTFCA_7_EXTRACT]], 7 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_7_INSERT]], i32 [[DOTFCA_8_EXTRACT]], 8 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_8_INSERT]], i32 [[DOTFCA_9_EXTRACT]], 9 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_10_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_9_INSERT]], i32 [[DOTFCA_10_EXTRACT]], 10 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_11_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_10_INSERT]], i32 [[DOTFCA_11_EXTRACT]], 11 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_12_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_11_INSERT]], i32 [[DOTFCA_12_EXTRACT]], 12 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_13_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_12_INSERT]], i32 [[DOTFCA_13_EXTRACT]], 13 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_14_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_13_INSERT]], i32 [[DOTFCA_14_EXTRACT]], 14 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_15_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_14_INSERT]], i32 [[DOTFCA_15_EXTRACT]], 15 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_16_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_15_INSERT]], i32 [[DOTFCA_16_EXTRACT]], 16 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_17_INSERT:%.*]] = 
insertvalue [30 x i32] [[DOTFCA_16_INSERT]], i32 [[DOTFCA_17_EXTRACT]], 17 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_18_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_17_INSERT]], i32 [[DOTFCA_18_EXTRACT]], 18 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_19_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_18_INSERT]], i32 [[DOTFCA_19_EXTRACT]], 19 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_20_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_19_INSERT]], i32 [[DOTFCA_20_EXTRACT]], 20 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_21_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_20_INSERT]], i32 [[DOTFCA_21_EXTRACT]], 21 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_22_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_21_INSERT]], i32 [[DOTFCA_22_EXTRACT]], 22 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_23_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_22_INSERT]], i32 [[DOTFCA_23_EXTRACT]], 23 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_24_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_23_INSERT]], i32 [[DOTFCA_24_EXTRACT]], 24 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_25_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_24_INSERT]], i32 [[DOTFCA_25_EXTRACT]], 25 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_26_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_25_INSERT]], i32 [[DOTFCA_26_EXTRACT]], 26 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_27_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT]], i32 [[DOTFCA_27_EXTRACT]], 27 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[DOTFCA_28_EXTRACT]], 28 +; POST-PROCESS-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29 +; POST-PROCESS-CPS-NEXT: [[TMP201:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP202:%.*]] = add i32 [[TMP201]], -116 +; POST-PROCESS-CPS-NEXT: store i32 [[TMP202]], ptr [[CSP]], align 4 +; POST-PROCESS-CPS-NEXT: [[TMP203:%.*]] = zext i32 [[RETURNADDR_RELOAD]] to i64 +; POST-PROCESS-CPS-NEXT: [[TMP204:%.*]] = load i32, ptr [[CSP]], align 4 +; 
POST-PROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 [[TMP203]], i32 [[TMP204]], i64 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT253]], [23 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POST-PROCESS-CPS-NEXT: unreachable +; +; +; POST-PROCESS-GLOBAL-CPS-LABEL: define %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes( +; POST-PROCESS-GLOBAL-CPS-SAME: ptr [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { +; POST-PROCESS-GLOBAL-CPS-NEXT: [[ADDR:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[DATA]], i32 0, i32 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[VAL:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], ptr [[ADDR]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: ret [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL]] +; +; +; POST-PROCESS-GLOBAL-CPS-LABEL: define i32 @_cont_GetLocalRootIndex( +; POST-PROCESS-GLOBAL-CPS-SAME: ptr [[DATA:%.*]]) #[[ATTR0]] { +; POST-PROCESS-GLOBAL-CPS-NEXT: ret i32 5 +; +; +; POST-PROCESS-GLOBAL-CPS-LABEL: define void @main( +; POST-PROCESS-GLOBAL-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !lgc.cps [[META19:![0-9]+]] !continuation [[META20:![0-9]+]] !continuation.stacksize [[META21:![0-9]+]] { +; POST-PROCESS-GLOBAL-CPS-NEXT: AllocaSpillBB: +; POST-PROCESS-GLOBAL-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP1:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(22) +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 108 +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP4]], ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: 
[[DOTFCA_0_EXTRACT56:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP5:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP6:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP5]]) +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT56]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP10:%.*]] = call i64 @continuation.getAddrAndMD(ptr @main.resume.0) +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP10]], 5 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP3]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP11]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP12:%.*]] = add i32 [[TMP3]], 4 +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP12]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP13]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP14:%.*]] = add i32 [[TMP3]], 8 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP14]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP15]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP16:%.*]] = add i32 [[TMP3]], 12 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP16]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP17]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP18:%.*]] = add i32 [[TMP3]], 16 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP18]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP19]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP20:%.*]] = add i32 [[TMP3]], 20 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP20]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP21]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP22:%.*]] = add i32 [[TMP3]], 24 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP22]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP23]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP24:%.*]] = add i32 [[TMP3]], 28 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP24]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP25]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP26:%.*]] = add i32 [[TMP3]], 32 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP26]] +; POST-PROCESS-GLOBAL-CPS-NEXT: 
store i32 undef, ptr addrspace(22) [[TMP27]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP28:%.*]] = add i32 [[TMP3]], 36 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP28]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP29]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP30:%.*]] = add i32 [[TMP3]], 40 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP30]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP31]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP32:%.*]] = add i32 [[TMP3]], 44 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP32]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP33]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP34:%.*]] = add i32 [[TMP3]], 48 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP34]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP35]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP36:%.*]] = add i32 [[TMP3]], 52 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP36]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP37]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP38:%.*]] = add i32 [[TMP3]], 56 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP38]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP39]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP40:%.*]] = add i32 [[TMP3]], 60 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP40]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP41]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP42:%.*]] = add i32 [[TMP3]], 64 +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP42]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP43]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP44:%.*]] = add i32 [[TMP3]], 68 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP44]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP45]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP46:%.*]] = add i32 [[TMP3]], 72 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP46]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP47]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP48:%.*]] = add i32 [[TMP3]], 76 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP48]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP49]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP50:%.*]] = add i32 [[TMP3]], 80 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP50]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP51]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP52:%.*]] = add i32 [[TMP3]], 84 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP52]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP53]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP54:%.*]] = add i32 [[TMP3]], 88 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP54]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP55]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP56:%.*]] = add i32 [[TMP3]], 92 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP56]] +; 
POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP57]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP58:%.*]] = add i32 [[TMP3]], 96 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP58]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP59]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP60:%.*]] = add i32 [[TMP3]], 100 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP60]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP61]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP62:%.*]] = add i32 [[TMP3]], 104 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP62]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 undef, ptr addrspace(22) [[TMP63]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [30 x i32] poison, i32 [[TMP3]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_0_INSERT]], i32 undef, 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_1_INSERT]], i32 undef, 2 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_2_INSERT]], i32 undef, 3 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_3_INSERT]], i32 undef, 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_4_INSERT]], i32 undef, 5 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_5_INSERT]], i32 undef, 6 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_6_INSERT]], i32 undef, 7 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_7_INSERT]], i32 undef, 8 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [30 x i32] 
[[DOTFCA_8_INSERT]], i32 undef, 9 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_10_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_9_INSERT]], i32 undef, 10 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_11_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_10_INSERT]], i32 undef, 11 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_12_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_11_INSERT]], i32 undef, 12 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_13_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_12_INSERT]], i32 undef, 13 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_14_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_13_INSERT]], i32 undef, 14 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_15_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_14_INSERT]], i32 undef, 15 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_16_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_15_INSERT]], i32 undef, 16 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_17_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_16_INSERT]], i32 undef, 17 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_18_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_17_INSERT]], i32 undef, 18 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_19_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_18_INSERT]], i32 undef, 19 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_20_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_19_INSERT]], i32 undef, 20 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_21_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_20_INSERT]], i32 undef, 21 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_22_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_21_INSERT]], i32 undef, 22 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_23_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_22_INSERT]], i32 undef, 23 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_24_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_23_INSERT]], i32 undef, 24 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_25_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_24_INSERT]], i32 undef, 25 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_26_INSERT:%.*]] = 
insertvalue [30 x i32] [[DOTFCA_25_INSERT]], i32 undef, 26 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_27_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT]], i32 undef, 27 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 undef, 28 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 undef, 29 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP64:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: call void (...) @lgc.ilcps.waitContinue(i64 4, i64 -1, i32 [[TMP64]], i64 [[TMP10]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POST-PROCESS-GLOBAL-CPS-NEXT: unreachable +; +; +; POST-PROCESS-GLOBAL-CPS-LABEL: define dso_local void @main.resume.0( +; POST-PROCESS-GLOBAL-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [23 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META8]] !lgc.cps [[META19]] !continuation [[META20]] { +; POST-PROCESS-GLOBAL-CPS-NEXT: entryresume.0: +; POST-PROCESS-GLOBAL-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP4:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(22) +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], -108 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP8:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP3]], 2 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: 
[[DOTFCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 2 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 3 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_5_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 5 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_6_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 6 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_7_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 7 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_8_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 8 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_9_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 9 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_10_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 10 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_11_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 11 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_12_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 12 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_13_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 13 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_14_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 14 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_15_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 15 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_16_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 16 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_17_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 17 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_18_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 18 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_19_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 19 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_20_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 20 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_21_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 21 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_22_EXTRACT:%.*]] = extractvalue [30 x i32] 
[[TMP8]], 22 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_23_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 23 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_24_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 24 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_25_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 25 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_26_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 26 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 27 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 28 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 29 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[DOTFCA_0_EXTRACT]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(22) [[TMP9]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP11:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP11]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(22) [[TMP12]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP14:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 8 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP14]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(22) [[TMP15]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP17:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 12 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP17]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(22) [[TMP18]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP20:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 16 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP20]] +; POST-PROCESS-GLOBAL-CPS-NEXT: 
[[TMP22:%.*]] = load i32, ptr addrspace(22) [[TMP21]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP23:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 20 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP23]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(22) [[TMP24]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP26:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 24 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP26]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(22) [[TMP27]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP29:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 28 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP29]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(22) [[TMP30]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP32:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 32 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP32]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP34:%.*]] = load i32, ptr addrspace(22) [[TMP33]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP35:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 36 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP35]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP37:%.*]] = load i32, ptr addrspace(22) [[TMP36]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP38:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 40 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP38]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP40:%.*]] = load i32, ptr addrspace(22) [[TMP39]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP41:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 44 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP41]] +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP43:%.*]] = load i32, ptr addrspace(22) [[TMP42]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP44:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 48 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP44]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP46:%.*]] = load i32, ptr addrspace(22) [[TMP45]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP47:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 52 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP47]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(22) [[TMP48]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP50:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 56 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP50]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP52:%.*]] = load i32, ptr addrspace(22) [[TMP51]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP53:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 60 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP53]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP55:%.*]] = load i32, ptr addrspace(22) [[TMP54]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP56:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 64 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP56]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP58:%.*]] = load i32, ptr addrspace(22) [[TMP57]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP59:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 68 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP59]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP61:%.*]] = load i32, ptr addrspace(22) [[TMP60]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP62:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 72 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 
[[TMP62]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP64:%.*]] = load i32, ptr addrspace(22) [[TMP63]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP65:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 76 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP65]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP67:%.*]] = load i32, ptr addrspace(22) [[TMP66]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP68:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 80 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP69:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP68]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP70:%.*]] = load i32, ptr addrspace(22) [[TMP69]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP71:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 84 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP72:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP71]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP73:%.*]] = load i32, ptr addrspace(22) [[TMP72]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP74:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 88 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP75:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP74]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP76:%.*]] = load i32, ptr addrspace(22) [[TMP75]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP77:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 92 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP78:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP77]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP79:%.*]] = load i32, ptr addrspace(22) [[TMP78]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP80:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 96 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP81:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP80]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP82:%.*]] = load i32, ptr addrspace(22) [[TMP81]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP83:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 100 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP84:%.*]] = getelementptr i8, ptr addrspace(22) 
[[TMP5]], i32 [[TMP83]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP85:%.*]] = load i32, ptr addrspace(22) [[TMP84]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP86:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 104 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP87:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP86]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP88:%.*]] = load i32, ptr addrspace(22) [[TMP87]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP89:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP3]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_EXTRACT57:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP89]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP90:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP91:%.*]] = add i32 [[TMP90]], -108 +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP91]], ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: ret void +; +; +; POST-PROCESS-GLOBAL-CPS-LABEL: define void @AnyHit( +; POST-PROCESS-GLOBAL-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META23:![0-9]+]] !continuation [[META24:![0-9]+]] { +; POST-PROCESS-GLOBAL-CPS-NEXT: AllocaSpillBB: +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP0:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(22) +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 2 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 3 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 5 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 6 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 7 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 8 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 9 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_10_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 10 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_11_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 11 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_12_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 12 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_13_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 13 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_14_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 14 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_15_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 15 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_16_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 16 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_17_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 17 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_18_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 18 +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_19_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 19 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_20_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 20 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_21_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 21 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_22_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 22 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_23_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 23 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_24_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 24 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_25_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 25 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_26_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 26 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 27 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 28 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 29 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 0, 0, 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0, i32 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: store <3 x i32> [[SYSTEM_DATA_FCA_0_0_0_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_0_0_0_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 0, 1, 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_1_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 1, i32 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: 
store <2 x float> [[SYSTEM_DATA_FCA_0_0_1_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_0_1_0_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_0_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 1, 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_0_1_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 1, i32 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: store float [[SYSTEM_DATA_FCA_0_1_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_1_0_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_0_1_1_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 1, 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_0_1_1_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 1, i32 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[SYSTEM_DATA_FCA_0_1_1_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_1_1_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 2 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_0_2_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 2 +; POST-PROCESS-GLOBAL-CPS-NEXT: store <3 x float> [[SYSTEM_DATA_FCA_0_2_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_2_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 3 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_0_3_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 3 +; POST-PROCESS-GLOBAL-CPS-NEXT: store <3 x float> [[SYSTEM_DATA_FCA_0_3_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_3_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_0_4_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 4 +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_0_4_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: store float [[SYSTEM_DATA_FCA_0_4_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_4_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_0_5_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 0, 5 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_0_5_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 5 +; POST-PROCESS-GLOBAL-CPS-NEXT: store i64 [[SYSTEM_DATA_FCA_0_5_EXTRACT]], ptr [[SYSTEM_DATA_FCA_0_5_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 1, 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_1_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: store float [[SYSTEM_DATA_FCA_1_0_EXTRACT]], ptr [[SYSTEM_DATA_FCA_1_0_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_1_1_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], 1, 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_1_1_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[SYSTEM_DATA_FCA_1_1_EXTRACT]], ptr [[SYSTEM_DATA_FCA_1_1_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[PAYLOAD_FCA_0_EXTRACT]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(22) [[TMP2]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP4:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP4]] +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(22) [[TMP5]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP7:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 8 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP7]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(22) [[TMP8]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP10:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 12 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP10]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(22) [[TMP11]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP13:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 16 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP13]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(22) [[TMP14]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP16:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 20 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP16]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(22) [[TMP17]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP19:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 24 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP19]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(22) [[TMP20]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP22:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 28 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP22]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(22) [[TMP23]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP25:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 32 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr 
addrspace(22) [[TMP1]], i32 [[TMP25]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(22) [[TMP26]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP28:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 36 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP28]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP30:%.*]] = load i32, ptr addrspace(22) [[TMP29]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP31:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 40 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP31]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(22) [[TMP32]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP34:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 44 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP34]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(22) [[TMP35]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP37:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 48 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP37]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP39:%.*]] = load i32, ptr addrspace(22) [[TMP38]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP40:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 52 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP40]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP42:%.*]] = load i32, ptr addrspace(22) [[TMP41]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP43:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 56 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP43]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(22) [[TMP44]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP46:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 60 +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP46]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP48:%.*]] = load i32, ptr addrspace(22) [[TMP47]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP49:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 64 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP49]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP51:%.*]] = load i32, ptr addrspace(22) [[TMP50]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP52:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 68 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP52]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP54:%.*]] = load i32, ptr addrspace(22) [[TMP53]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP55:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 72 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP55]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP57:%.*]] = load i32, ptr addrspace(22) [[TMP56]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP58:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 76 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP58]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP60:%.*]] = load i32, ptr addrspace(22) [[TMP59]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP61:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 80 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP61]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP63:%.*]] = load i32, ptr addrspace(22) [[TMP62]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP64:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 84 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP64]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP66:%.*]] = load i32, ptr addrspace(22) [[TMP65]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: 
[[TMP67:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 88 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP67]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP69:%.*]] = load i32, ptr addrspace(22) [[TMP68]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP70:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 92 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP71:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP70]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP72:%.*]] = load i32, ptr addrspace(22) [[TMP71]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP73:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 96 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP74:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP73]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP75:%.*]] = load i32, ptr addrspace(22) [[TMP74]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP76:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 100 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP77:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP76]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP78:%.*]] = load i32, ptr addrspace(22) [[TMP77]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP79:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 104 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP80:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP79]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP81:%.*]] = load i32, ptr addrspace(22) [[TMP80]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP82:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[ADDR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP82]], i32 0, i32 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[VAL_I_FCA_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[ADDR_I]], i32 0, i32 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[VAL_I_FCA_0_LOAD:%.*]] = load <2 x float>, ptr [[VAL_I_FCA_0_GEP]], align 4 +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[VAL_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[VAL_I_FCA_0_LOAD]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[VAL_I_FCA_0_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL_I_FCA_0_INSERT]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTSROA_025_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VAL_I_FCA_0_INSERT_FCA_0_EXTRACT]], i32 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP83:%.*]] = bitcast float [[DOTSROA_025_0_VEC_EXTRACT]] to i32 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTSROA_025_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VAL_I_FCA_0_INSERT_FCA_0_EXTRACT]], i32 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP84:%.*]] = bitcast float [[DOTSROA_025_4_VEC_EXTRACT]] to i32 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[HIT_ATTRS_FCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[HIT_ATTRS]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) +; POST-PROCESS-GLOBAL-CPS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP85:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[PAYLOAD_FCA_0_EXTRACT]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP3]], ptr addrspace(22) [[TMP85]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP86:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP87:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP86]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP6]], ptr addrspace(22) [[TMP87]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP88:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 8 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP89:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP88]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP9]], ptr addrspace(22) [[TMP89]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP90:%.*]] = add i32 
[[PAYLOAD_FCA_0_EXTRACT]], 12 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP91:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP90]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP12]], ptr addrspace(22) [[TMP91]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP92:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 16 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP93:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP92]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP15]], ptr addrspace(22) [[TMP93]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP94:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 20 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP95:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP94]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP18]], ptr addrspace(22) [[TMP95]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP96:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 24 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP97:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP96]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP21]], ptr addrspace(22) [[TMP97]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP98:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 28 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP99:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP98]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP24]], ptr addrspace(22) [[TMP99]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP100:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 32 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP101:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP100]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP27]], ptr addrspace(22) [[TMP101]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP102:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 36 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP103:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP102]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP30]], ptr addrspace(22) [[TMP103]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: 
[[TMP104:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 40 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP105:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP104]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP33]], ptr addrspace(22) [[TMP105]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP106:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 44 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP107:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP106]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP36]], ptr addrspace(22) [[TMP107]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP108:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 48 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP109:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP108]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP39]], ptr addrspace(22) [[TMP109]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP110:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 52 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP111:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP110]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP42]], ptr addrspace(22) [[TMP111]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP112:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 56 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP113:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP112]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP45]], ptr addrspace(22) [[TMP113]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP114:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 60 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP115:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP114]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP48]], ptr addrspace(22) [[TMP115]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP116:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 64 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP117:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP116]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP51]], ptr addrspace(22) [[TMP117]], 
align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP118:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 68 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP119:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP118]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP54]], ptr addrspace(22) [[TMP119]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP120:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 72 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP121:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP120]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP57]], ptr addrspace(22) [[TMP121]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP122:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 76 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP123:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP122]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP60]], ptr addrspace(22) [[TMP123]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP124:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 80 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP125:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP124]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP63]], ptr addrspace(22) [[TMP125]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP126:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 84 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP127:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP126]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP66]], ptr addrspace(22) [[TMP127]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP128:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 88 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP129:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP128]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP69]], ptr addrspace(22) [[TMP129]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP130:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 92 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP131:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP130]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 
[[TMP72]], ptr addrspace(22) [[TMP131]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP132:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 96 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP133:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP132]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP75]], ptr addrspace(22) [[TMP133]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP134:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 100 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP135:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP134]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP78]], ptr addrspace(22) [[TMP135]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP136:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 104 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP137:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP136]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP81]], ptr addrspace(22) [[TMP137]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP138:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT]] to i32 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP139:%.*]] = bitcast i32 [[TMP138]] to float +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTSROA_027_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP139]], i32 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP140:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT]] to i32 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP141:%.*]] = bitcast i32 [[TMP140]] to float +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTSROA_027_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_027_0_VEC_INSERT]], float [[TMP141]], i32 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_INSERT26:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x 
float> [[DOTSROA_027_4_VEC_INSERT]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP142:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP142]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT26]]) +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_0_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0, i32 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_0_0_0_LOAD:%.*]] = load <3 x i32>, ptr [[DOTFCA_0_0_0_0_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[DOTFCA_0_0_0_0_LOAD]], 0, 0, 0, 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_0_1_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 1, i32 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_0_1_0_LOAD:%.*]] = load <2 x float>, ptr [[DOTFCA_0_0_1_0_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_0_1_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_0_0_0_INSERT]], <2 x float> [[DOTFCA_0_0_1_0_LOAD]], 0, 0, 1, 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_1_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 1, i32 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_1_0_LOAD:%.*]] = load float, ptr [[DOTFCA_0_1_0_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_1_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_0_1_0_INSERT]], float [[DOTFCA_0_1_0_LOAD]], 0, 1, 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_1_1_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 1, i32 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_1_1_LOAD:%.*]] = load i32, ptr 
[[DOTFCA_0_1_1_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_1_1_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_1_0_INSERT]], i32 [[DOTFCA_0_1_1_LOAD]], 0, 1, 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_2_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 2 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_2_LOAD:%.*]] = load <3 x float>, ptr [[DOTFCA_0_2_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_1_1_INSERT]], <3 x float> [[DOTFCA_0_2_LOAD]], 0, 2 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_3_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 3 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_3_LOAD:%.*]] = load <3 x float>, ptr [[DOTFCA_0_3_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_2_INSERT]], <3 x float> [[DOTFCA_0_3_LOAD]], 0, 3 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_4_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_4_LOAD:%.*]] = load float, ptr [[DOTFCA_0_4_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_4_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_3_INSERT]], float [[DOTFCA_0_4_LOAD]], 0, 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_5_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 5 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_5_LOAD:%.*]] = load i64, ptr [[DOTFCA_0_5_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_5_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_4_INSERT]], i64 [[DOTFCA_0_5_LOAD]], 0, 5 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_1_0_GEP:%.*]] = getelementptr inbounds 
[[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_1_0_LOAD:%.*]] = load float, ptr [[DOTFCA_1_0_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_1_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_0_5_INSERT]], float [[DOTFCA_1_0_LOAD]], 1, 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_1_1_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_1_1_LOAD:%.*]] = load i32, ptr [[DOTFCA_1_1_GEP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_1_1_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_0_INSERT]], i32 [[DOTFCA_1_1_LOAD]], 1, 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [30 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_2_INSERT]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_3_INSERT]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_4_INSERT]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_5_INSERT]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_6_INSERT]], i32 [[PAYLOAD_FCA_7_EXTRACT]], 7 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_7_INSERT]], i32 [[PAYLOAD_FCA_8_EXTRACT]], 8 +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_8_INSERT]], i32 [[PAYLOAD_FCA_9_EXTRACT]], 9 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_10_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_9_INSERT]], i32 [[PAYLOAD_FCA_10_EXTRACT]], 10 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_11_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_10_INSERT]], i32 [[PAYLOAD_FCA_11_EXTRACT]], 11 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_12_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_11_INSERT]], i32 [[PAYLOAD_FCA_12_EXTRACT]], 12 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_13_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_12_INSERT]], i32 [[PAYLOAD_FCA_13_EXTRACT]], 13 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_14_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_13_INSERT]], i32 [[PAYLOAD_FCA_14_EXTRACT]], 14 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_15_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_14_INSERT]], i32 [[PAYLOAD_FCA_15_EXTRACT]], 15 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_16_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_15_INSERT]], i32 [[PAYLOAD_FCA_16_EXTRACT]], 16 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_17_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_16_INSERT]], i32 [[PAYLOAD_FCA_17_EXTRACT]], 17 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_18_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_17_INSERT]], i32 [[PAYLOAD_FCA_18_EXTRACT]], 18 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_19_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_18_INSERT]], i32 [[PAYLOAD_FCA_19_EXTRACT]], 19 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_20_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_19_INSERT]], i32 [[PAYLOAD_FCA_20_EXTRACT]], 20 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_21_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_20_INSERT]], i32 [[PAYLOAD_FCA_21_EXTRACT]], 21 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_22_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_21_INSERT]], i32 [[PAYLOAD_FCA_22_EXTRACT]], 22 +; POST-PROCESS-GLOBAL-CPS-NEXT: 
[[DOTFCA_23_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_22_INSERT]], i32 [[PAYLOAD_FCA_23_EXTRACT]], 23 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_24_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_23_INSERT]], i32 [[PAYLOAD_FCA_24_EXTRACT]], 24 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_25_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_24_INSERT]], i32 [[PAYLOAD_FCA_25_EXTRACT]], 25 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_26_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_25_INSERT]], i32 [[PAYLOAD_FCA_26_EXTRACT]], 26 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_27_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT]], i32 [[PAYLOAD_FCA_27_EXTRACT]], 27 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP143:%.*]] = zext i32 [[RETURNADDR]] to i64 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP144:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP143]], i32 [[TMP144]], i64 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POST-PROCESS-GLOBAL-CPS-NEXT: unreachable +; +; +; POST-PROCESS-GLOBAL-CPS-LABEL: define void @ClosestHit( +; POST-PROCESS-GLOBAL-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [21 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META25:![0-9]+]] !lgc.cps [[META22]] !continuation [[META26:![0-9]+]] !continuation.stacksize [[META21]] { +; POST-PROCESS-GLOBAL-CPS-NEXT: AllocaSpillBB: +; POST-PROCESS-GLOBAL-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP0:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(22) +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP2:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 116 +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP3]], ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 108 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP4]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[RETURNADDR]], ptr addrspace(22) [[TMP5]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], 112 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP6]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[PAYLOAD_FCA_0_EXTRACT]], ptr addrspace(22) [[TMP7]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: 
[[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 2 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_3_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 3 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_4_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_5_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 5 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_6_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 6 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_7_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 7 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_8_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 8 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_9_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 9 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_10_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 10 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_11_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 11 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_12_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 12 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_13_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 13 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_14_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 14 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_15_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 15 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_16_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 16 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_17_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 17 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_18_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 18 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_19_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 19 +; POST-PROCESS-GLOBAL-CPS-NEXT: 
[[PAYLOAD_FCA_20_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 20 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_21_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 21 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_22_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 22 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_23_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 23 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_24_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 24 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_25_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 25 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_26_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 26 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 27 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 28 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 29 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], 0, 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYSTEM_DATA_FCA_1_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], 1, 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[PAYLOAD_FCA_0_EXTRACT]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(22) [[TMP8]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP10:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP10]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(22) [[TMP11]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP13:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 8 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP13]] +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(22) [[TMP14]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP16:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 12 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP16]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(22) [[TMP17]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP19:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 16 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP19]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(22) [[TMP20]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP22:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 20 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP22]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(22) [[TMP23]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP25:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 24 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP25]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(22) [[TMP26]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP28:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 28 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP28]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP30:%.*]] = load i32, ptr addrspace(22) [[TMP29]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP31:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 32 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP31]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(22) [[TMP32]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP34:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 36 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP35:%.*]] = getelementptr i8, 
ptr addrspace(22) [[TMP1]], i32 [[TMP34]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(22) [[TMP35]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP37:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 40 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP37]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP39:%.*]] = load i32, ptr addrspace(22) [[TMP38]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP40:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 44 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP40]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP42:%.*]] = load i32, ptr addrspace(22) [[TMP41]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP43:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 48 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP43]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(22) [[TMP44]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP46:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 52 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP46]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP48:%.*]] = load i32, ptr addrspace(22) [[TMP47]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP49:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 56 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP49]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP51:%.*]] = load i32, ptr addrspace(22) [[TMP50]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP52:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 60 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP52]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP54:%.*]] = load i32, ptr addrspace(22) [[TMP53]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP55:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 64 +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP55]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP57:%.*]] = load i32, ptr addrspace(22) [[TMP56]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP58:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 68 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP58]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP60:%.*]] = load i32, ptr addrspace(22) [[TMP59]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP61:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 72 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP61]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP63:%.*]] = load i32, ptr addrspace(22) [[TMP62]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP64:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 76 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP64]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP66:%.*]] = load i32, ptr addrspace(22) [[TMP65]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP67:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 80 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP67]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP69:%.*]] = load i32, ptr addrspace(22) [[TMP68]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP70:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 84 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP71:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP70]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP72:%.*]] = load i32, ptr addrspace(22) [[TMP71]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP73:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 88 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP74:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP73]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP75:%.*]] = load i32, ptr addrspace(22) [[TMP74]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: 
[[TMP76:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 92 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP77:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP76]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP78:%.*]] = load i32, ptr addrspace(22) [[TMP77]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP79:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 96 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP80:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP79]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP81:%.*]] = load i32, ptr addrspace(22) [[TMP80]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP82:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 100 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP83:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP82]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP84:%.*]] = load i32, ptr addrspace(22) [[TMP83]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP85:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT]], 104 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP86:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP85]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP87:%.*]] = load i32, ptr addrspace(22) [[TMP86]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[VAL_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] poison, <2 x float> [[SYSTEM_DATA_FCA_1_0_EXTRACT]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[VAL_I_FCA_0_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL_I_FCA_0_INSERT]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTSROA_0257_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VAL_I_FCA_0_INSERT_FCA_0_EXTRACT]], i32 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP88:%.*]] = bitcast float [[DOTSROA_0257_0_VEC_EXTRACT]] to i32 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTSROA_0257_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VAL_I_FCA_0_INSERT_FCA_0_EXTRACT]], i32 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP89:%.*]] = bitcast float [[DOTSROA_0257_4_VEC_EXTRACT]] to i32 +; POST-PROCESS-GLOBAL-CPS-NEXT: 
call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP90:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP91:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP92:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP90]]) +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP93:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP92]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP94:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP93]]) +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[SYSTEM_DATA_FCA_0_0_EXTRACT]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP95:%.*]] = call i64 @continuation.getAddrAndMD(ptr @ClosestHit.resume.0) +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP95]], 5 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP96:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP2]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP9]], ptr addrspace(22) [[TMP96]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP97:%.*]] = add i32 [[TMP2]], 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP98:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP97]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP12]], ptr addrspace(22) [[TMP98]], align 4 +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP99:%.*]] = add i32 [[TMP2]], 8 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP100:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP99]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP15]], ptr addrspace(22) [[TMP100]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP101:%.*]] = add i32 [[TMP2]], 12 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP102:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP101]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP18]], ptr addrspace(22) [[TMP102]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP103:%.*]] = add i32 [[TMP2]], 16 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP104:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP103]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP21]], ptr addrspace(22) [[TMP104]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP105:%.*]] = add i32 [[TMP2]], 20 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP106:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP105]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP24]], ptr addrspace(22) [[TMP106]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP107:%.*]] = add i32 [[TMP2]], 24 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP108:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP107]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP27]], ptr addrspace(22) [[TMP108]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP109:%.*]] = add i32 [[TMP2]], 28 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP110:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP109]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP30]], ptr addrspace(22) [[TMP110]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP111:%.*]] = add i32 [[TMP2]], 32 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP112:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP111]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP33]], ptr addrspace(22) [[TMP112]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP113:%.*]] = add i32 [[TMP2]], 36 +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP114:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP113]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP36]], ptr addrspace(22) [[TMP114]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP115:%.*]] = add i32 [[TMP2]], 40 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP116:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP115]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP39]], ptr addrspace(22) [[TMP116]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP117:%.*]] = add i32 [[TMP2]], 44 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP118:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP117]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP42]], ptr addrspace(22) [[TMP118]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP119:%.*]] = add i32 [[TMP2]], 48 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP120:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP119]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP45]], ptr addrspace(22) [[TMP120]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP121:%.*]] = add i32 [[TMP2]], 52 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP122:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP121]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP48]], ptr addrspace(22) [[TMP122]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP123:%.*]] = add i32 [[TMP2]], 56 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP124:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP123]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP51]], ptr addrspace(22) [[TMP124]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP125:%.*]] = add i32 [[TMP2]], 60 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP126:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP125]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP54]], ptr addrspace(22) [[TMP126]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP127:%.*]] = add i32 [[TMP2]], 64 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP128:%.*]] = getelementptr i8, ptr 
addrspace(22) [[TMP1]], i32 [[TMP127]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP57]], ptr addrspace(22) [[TMP128]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP129:%.*]] = add i32 [[TMP2]], 68 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP130:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP129]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP60]], ptr addrspace(22) [[TMP130]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP131:%.*]] = add i32 [[TMP2]], 72 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP132:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP131]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP63]], ptr addrspace(22) [[TMP132]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP133:%.*]] = add i32 [[TMP2]], 76 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP134:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP133]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP66]], ptr addrspace(22) [[TMP134]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP135:%.*]] = add i32 [[TMP2]], 80 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP136:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP135]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP69]], ptr addrspace(22) [[TMP136]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP137:%.*]] = add i32 [[TMP2]], 84 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP138:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP137]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP72]], ptr addrspace(22) [[TMP138]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP139:%.*]] = add i32 [[TMP2]], 88 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP140:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP139]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP75]], ptr addrspace(22) [[TMP140]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP141:%.*]] = add i32 [[TMP2]], 92 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP142:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP141]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 
[[TMP78]], ptr addrspace(22) [[TMP142]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP143:%.*]] = add i32 [[TMP2]], 96 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP144:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP143]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP81]], ptr addrspace(22) [[TMP144]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP145:%.*]] = add i32 [[TMP2]], 100 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP146:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP145]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP84]], ptr addrspace(22) [[TMP146]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP147:%.*]] = add i32 [[TMP2]], 104 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP148:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP1]], i32 [[TMP147]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP87]], ptr addrspace(22) [[TMP148]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_INSERT54:%.*]] = insertvalue [30 x i32] poison, i32 [[TMP2]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_1_INSERT57:%.*]] = insertvalue [30 x i32] [[DOTFCA_0_INSERT54]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_2_INSERT60:%.*]] = insertvalue [30 x i32] [[DOTFCA_1_INSERT57]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_3_INSERT63:%.*]] = insertvalue [30 x i32] [[DOTFCA_2_INSERT60]], i32 [[PAYLOAD_FCA_3_EXTRACT]], 3 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_4_INSERT66:%.*]] = insertvalue [30 x i32] [[DOTFCA_3_INSERT63]], i32 [[PAYLOAD_FCA_4_EXTRACT]], 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_5_INSERT69:%.*]] = insertvalue [30 x i32] [[DOTFCA_4_INSERT66]], i32 [[PAYLOAD_FCA_5_EXTRACT]], 5 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_6_INSERT72:%.*]] = insertvalue [30 x i32] [[DOTFCA_5_INSERT69]], i32 [[PAYLOAD_FCA_6_EXTRACT]], 6 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_7_INSERT75:%.*]] = insertvalue [30 x i32] [[DOTFCA_6_INSERT72]], i32 [[PAYLOAD_FCA_7_EXTRACT]], 7 +; POST-PROCESS-GLOBAL-CPS-NEXT: 
[[DOTFCA_8_INSERT78:%.*]] = insertvalue [30 x i32] [[DOTFCA_7_INSERT75]], i32 [[PAYLOAD_FCA_8_EXTRACT]], 8 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_9_INSERT81:%.*]] = insertvalue [30 x i32] [[DOTFCA_8_INSERT78]], i32 [[PAYLOAD_FCA_9_EXTRACT]], 9 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_10_INSERT84:%.*]] = insertvalue [30 x i32] [[DOTFCA_9_INSERT81]], i32 [[PAYLOAD_FCA_10_EXTRACT]], 10 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_11_INSERT87:%.*]] = insertvalue [30 x i32] [[DOTFCA_10_INSERT84]], i32 [[PAYLOAD_FCA_11_EXTRACT]], 11 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_12_INSERT90:%.*]] = insertvalue [30 x i32] [[DOTFCA_11_INSERT87]], i32 [[PAYLOAD_FCA_12_EXTRACT]], 12 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_13_INSERT93:%.*]] = insertvalue [30 x i32] [[DOTFCA_12_INSERT90]], i32 [[PAYLOAD_FCA_13_EXTRACT]], 13 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_14_INSERT96:%.*]] = insertvalue [30 x i32] [[DOTFCA_13_INSERT93]], i32 [[PAYLOAD_FCA_14_EXTRACT]], 14 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_15_INSERT99:%.*]] = insertvalue [30 x i32] [[DOTFCA_14_INSERT96]], i32 [[PAYLOAD_FCA_15_EXTRACT]], 15 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_16_INSERT102:%.*]] = insertvalue [30 x i32] [[DOTFCA_15_INSERT99]], i32 [[PAYLOAD_FCA_16_EXTRACT]], 16 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_17_INSERT105:%.*]] = insertvalue [30 x i32] [[DOTFCA_16_INSERT102]], i32 [[PAYLOAD_FCA_17_EXTRACT]], 17 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_18_INSERT108:%.*]] = insertvalue [30 x i32] [[DOTFCA_17_INSERT105]], i32 [[PAYLOAD_FCA_18_EXTRACT]], 18 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_19_INSERT111:%.*]] = insertvalue [30 x i32] [[DOTFCA_18_INSERT108]], i32 [[PAYLOAD_FCA_19_EXTRACT]], 19 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_20_INSERT114:%.*]] = insertvalue [30 x i32] [[DOTFCA_19_INSERT111]], i32 [[PAYLOAD_FCA_20_EXTRACT]], 20 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_21_INSERT117:%.*]] = insertvalue [30 x i32] [[DOTFCA_20_INSERT114]], i32 [[PAYLOAD_FCA_21_EXTRACT]], 21 +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_22_INSERT120:%.*]] = insertvalue [30 x i32] [[DOTFCA_21_INSERT117]], i32 [[PAYLOAD_FCA_22_EXTRACT]], 22 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_23_INSERT123:%.*]] = insertvalue [30 x i32] [[DOTFCA_22_INSERT120]], i32 [[PAYLOAD_FCA_23_EXTRACT]], 23 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_24_INSERT126:%.*]] = insertvalue [30 x i32] [[DOTFCA_23_INSERT123]], i32 [[PAYLOAD_FCA_24_EXTRACT]], 24 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_25_INSERT129:%.*]] = insertvalue [30 x i32] [[DOTFCA_24_INSERT126]], i32 [[PAYLOAD_FCA_25_EXTRACT]], 25 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_26_INSERT132:%.*]] = insertvalue [30 x i32] [[DOTFCA_25_INSERT129]], i32 [[PAYLOAD_FCA_26_EXTRACT]], 26 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_27_INSERT135:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT132]], i32 [[PAYLOAD_FCA_27_EXTRACT]], 27 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_28_INSERT138:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT135]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_29_INSERT141:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT138]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP149:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: call void (...) 
@lgc.ilcps.waitContinue(i64 4, i64 -1, i32 [[TMP149]], i64 [[TMP95]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [10 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT141]]) +; POST-PROCESS-GLOBAL-CPS-NEXT: unreachable +; +; +; POST-PROCESS-GLOBAL-CPS-LABEL: define dso_local void @ClosestHit.resume.0( +; POST-PROCESS-GLOBAL-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [23 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META25]] !lgc.cps [[META22]] !continuation [[META26]] { +; POST-PROCESS-GLOBAL-CPS-NEXT: entryresume.0: +; POST-PROCESS-GLOBAL-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP4:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(22) +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], -116 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP8:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP3]], 2 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 2 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 3 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_5_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 5 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_6_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 6 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_7_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 7 +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_8_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 8 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_9_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 9 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_10_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 10 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_11_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 11 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_12_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 12 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_13_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 13 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_14_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 14 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_15_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 15 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_16_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 16 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_17_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 17 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_18_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 18 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_19_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 19 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_20_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 20 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_21_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 21 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_22_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 22 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_23_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 23 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_24_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 24 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_25_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 25 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_26_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 26 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_27_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 27 +; POST-PROCESS-GLOBAL-CPS-NEXT: 
[[DOTFCA_28_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 28 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_29_EXTRACT:%.*]] = extractvalue [30 x i32] [[TMP8]], 29 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[DOTFCA_0_EXTRACT]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(22) [[TMP9]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP11:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP11]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(22) [[TMP12]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP14:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 8 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP14]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(22) [[TMP15]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP17:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 12 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP17]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(22) [[TMP18]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP20:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 16 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP20]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(22) [[TMP21]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP23:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 20 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP23]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(22) [[TMP24]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP26:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 24 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 
[[TMP26]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(22) [[TMP27]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP29:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 28 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP29]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(22) [[TMP30]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP32:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 32 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP32]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP34:%.*]] = load i32, ptr addrspace(22) [[TMP33]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP35:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 36 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP35]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP37:%.*]] = load i32, ptr addrspace(22) [[TMP36]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP38:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 40 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP38]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP40:%.*]] = load i32, ptr addrspace(22) [[TMP39]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP41:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 44 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP41]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP43:%.*]] = load i32, ptr addrspace(22) [[TMP42]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP44:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 48 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP44]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP46:%.*]] = load i32, ptr addrspace(22) [[TMP45]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP47:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 52 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr addrspace(22) 
[[TMP5]], i32 [[TMP47]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(22) [[TMP48]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP50:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 56 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP50]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP52:%.*]] = load i32, ptr addrspace(22) [[TMP51]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP53:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 60 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP53]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP55:%.*]] = load i32, ptr addrspace(22) [[TMP54]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP56:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 64 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP56]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP58:%.*]] = load i32, ptr addrspace(22) [[TMP57]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP59:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 68 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP59]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP61:%.*]] = load i32, ptr addrspace(22) [[TMP60]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP62:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 72 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP62]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP64:%.*]] = load i32, ptr addrspace(22) [[TMP63]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP65:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 76 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP65]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP67:%.*]] = load i32, ptr addrspace(22) [[TMP66]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP68:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 80 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP69:%.*]] = getelementptr i8, ptr 
addrspace(22) [[TMP5]], i32 [[TMP68]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP70:%.*]] = load i32, ptr addrspace(22) [[TMP69]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP71:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 84 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP72:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP71]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP73:%.*]] = load i32, ptr addrspace(22) [[TMP72]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP74:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 88 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP75:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP74]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP76:%.*]] = load i32, ptr addrspace(22) [[TMP75]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP77:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 92 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP78:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP77]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP79:%.*]] = load i32, ptr addrspace(22) [[TMP78]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP80:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 96 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP81:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP80]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP82:%.*]] = load i32, ptr addrspace(22) [[TMP81]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP83:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 100 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP84:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP83]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP85:%.*]] = load i32, ptr addrspace(22) [[TMP84]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP86:%.*]] = add i32 [[DOTFCA_0_EXTRACT]], 104 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP87:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP86]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP88:%.*]] = load i32, ptr addrspace(22) [[TMP87]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP89:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [23 x i32], [30 x i32] } [[TMP3]], 0 +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_EXTRACT254:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP89]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP90:%.*]] = add i32 [[TMP7]], 112 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP91:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP90]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT_RELOAD:%.*]] = load i32, ptr addrspace(22) [[TMP91]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP92:%.*]] = add i32 [[TMP7]], 108 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP93:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP92]] +; POST-PROCESS-GLOBAL-CPS-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i32, ptr addrspace(22) [[TMP93]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP94:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP10]], ptr addrspace(22) [[TMP94]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP95:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP96:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP95]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP13]], ptr addrspace(22) [[TMP96]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP97:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 8 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP98:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP97]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP16]], ptr addrspace(22) [[TMP98]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP99:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 12 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP100:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP99]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP19]], ptr addrspace(22) [[TMP100]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP101:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 16 +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP102:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP101]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP22]], ptr addrspace(22) [[TMP102]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP103:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 20 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP104:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP103]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP25]], ptr addrspace(22) [[TMP104]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP105:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 24 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP106:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP105]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP28]], ptr addrspace(22) [[TMP106]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP107:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 28 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP108:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP107]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP31]], ptr addrspace(22) [[TMP108]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP109:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 32 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP110:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP109]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP34]], ptr addrspace(22) [[TMP110]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP111:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 36 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP112:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP111]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP37]], ptr addrspace(22) [[TMP112]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP113:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 40 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP114:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP113]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP40]], ptr addrspace(22) [[TMP114]], align 4 +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP115:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 44 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP116:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP115]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP43]], ptr addrspace(22) [[TMP116]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP117:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 48 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP118:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP117]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP46]], ptr addrspace(22) [[TMP118]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP119:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 52 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP120:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP119]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP49]], ptr addrspace(22) [[TMP120]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP121:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 56 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP122:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP121]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP52]], ptr addrspace(22) [[TMP122]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP123:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 60 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP124:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP123]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP55]], ptr addrspace(22) [[TMP124]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP125:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 64 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP126:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP125]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP58]], ptr addrspace(22) [[TMP126]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP127:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 68 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP128:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP127]] +; 
POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP61]], ptr addrspace(22) [[TMP128]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP129:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 72 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP130:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP129]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP64]], ptr addrspace(22) [[TMP130]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP131:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 76 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP132:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP131]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP67]], ptr addrspace(22) [[TMP132]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP133:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 80 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP134:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP133]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP70]], ptr addrspace(22) [[TMP134]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP135:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 84 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP136:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP135]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP73]], ptr addrspace(22) [[TMP136]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP137:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 88 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP138:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP137]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP76]], ptr addrspace(22) [[TMP138]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP139:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 92 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP140:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP139]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP79]], ptr addrspace(22) [[TMP140]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP141:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 96 +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP142:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP141]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP82]], ptr addrspace(22) [[TMP142]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP143:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 100 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP144:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP143]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP85]], ptr addrspace(22) [[TMP144]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP145:%.*]] = add i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 104 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP146:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP5]], i32 [[TMP145]] +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP88]], ptr addrspace(22) [[TMP146]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_INSERT253:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT254]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [30 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 0 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_0_INSERT]], i32 [[DOTFCA_1_EXTRACT]], 1 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_1_INSERT]], i32 [[DOTFCA_2_EXTRACT]], 2 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_2_INSERT]], i32 [[DOTFCA_3_EXTRACT]], 3 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_3_INSERT]], i32 [[DOTFCA_4_EXTRACT]], 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_4_INSERT]], i32 [[DOTFCA_5_EXTRACT]], 5 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_5_INSERT]], i32 [[DOTFCA_6_EXTRACT]], 6 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_6_INSERT]], i32 [[DOTFCA_7_EXTRACT]], 7 +; 
POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_7_INSERT]], i32 [[DOTFCA_8_EXTRACT]], 8 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_8_INSERT]], i32 [[DOTFCA_9_EXTRACT]], 9 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_10_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_9_INSERT]], i32 [[DOTFCA_10_EXTRACT]], 10 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_11_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_10_INSERT]], i32 [[DOTFCA_11_EXTRACT]], 11 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_12_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_11_INSERT]], i32 [[DOTFCA_12_EXTRACT]], 12 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_13_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_12_INSERT]], i32 [[DOTFCA_13_EXTRACT]], 13 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_14_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_13_INSERT]], i32 [[DOTFCA_14_EXTRACT]], 14 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_15_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_14_INSERT]], i32 [[DOTFCA_15_EXTRACT]], 15 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_16_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_15_INSERT]], i32 [[DOTFCA_16_EXTRACT]], 16 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_17_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_16_INSERT]], i32 [[DOTFCA_17_EXTRACT]], 17 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_18_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_17_INSERT]], i32 [[DOTFCA_18_EXTRACT]], 18 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_19_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_18_INSERT]], i32 [[DOTFCA_19_EXTRACT]], 19 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_20_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_19_INSERT]], i32 [[DOTFCA_20_EXTRACT]], 20 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_21_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_20_INSERT]], i32 [[DOTFCA_21_EXTRACT]], 21 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_22_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_21_INSERT]], i32 
[[DOTFCA_22_EXTRACT]], 22 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_23_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_22_INSERT]], i32 [[DOTFCA_23_EXTRACT]], 23 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_24_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_23_INSERT]], i32 [[DOTFCA_24_EXTRACT]], 24 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_25_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_24_INSERT]], i32 [[DOTFCA_25_EXTRACT]], 25 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_26_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_25_INSERT]], i32 [[DOTFCA_26_EXTRACT]], 26 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_27_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT]], i32 [[DOTFCA_27_EXTRACT]], 27 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[DOTFCA_28_EXTRACT]], 28 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP147:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP148:%.*]] = add i32 [[TMP147]], -116 +; POST-PROCESS-GLOBAL-CPS-NEXT: store i32 [[TMP148]], ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP149:%.*]] = zext i32 [[RETURNADDR_RELOAD]] to i64 +; POST-PROCESS-GLOBAL-CPS-NEXT: [[TMP150:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-CPS-NEXT: call void (...) 
@lgc.ilcps.continue(i64 [[TMP149]], i32 [[TMP150]], i64 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT253]], [23 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]) +; POST-PROCESS-GLOBAL-CPS-NEXT: unreachable +; diff --git a/llvmraytracing/test/dx/remat-intrinsic.ll b/llvmraytracing/test/dx/remat-intrinsic.ll index 8089dcc374..e51fe74b9a 100644 --- a/llvmraytracing/test/dx/remat-intrinsic.ll +++ b/llvmraytracing/test/dx/remat-intrinsic.ll @@ -143,10 +143,10 @@ attributes #1 = { nounwind } ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; POSTPROCESS-NEXT: [[TMP6:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA]]) ; POSTPROCESS-NEXT: [[I:%.*]] = extractelement <3 x i32> [[TMP6]], i8 0 -; POSTPROCESS-NEXT: [[UNPACKED:%.*]] = call [[DX_TYPES_FOURI32:%.*]] [[DX_OP_UNPACK4X8_I32:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 219, i8 1, i32 [[I]]) +; POSTPROCESS-NEXT: [[UNPACKED:%.*]] = call [[DX_TYPES_FOURI32:%.*]] @[[DX_OP_UNPACK4X8_I32:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 219, i8 1, i32 [[I]]) ; POSTPROCESS-NEXT: [[HANDLE0:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; POSTPROCESS-NEXT: [[HANDLE1:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[HANDLE0]]) -; POSTPROCESS-NEXT: [[HANDLE2:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[HANDLE1]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; POSTPROCESS-NEXT: [[HANDLE1:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[HANDLE0]]) +; POSTPROCESS-NEXT: [[HANDLE2:%.*]] = call [[DX_TYPES_HANDLE]] 
@[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[HANDLE1]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; POSTPROCESS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT9]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT4:%.*]] = insertvalue [1 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; POSTPROCESS-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(ptr @called.resume.0) @@ -174,20 +174,20 @@ attributes #1 = { nounwind } ; POSTPROCESS-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP5]], i32 0 ; POSTPROCESS-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(21) [[TMP6]], align 4 ; POSTPROCESS-NEXT: [[HANDLE011:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; POSTPROCESS-NEXT: [[HANDLE110:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[HANDLE011]]) -; POSTPROCESS-NEXT: [[HANDLE29:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[HANDLE110]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; POSTPROCESS-NEXT: [[HANDLE110:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[HANDLE011]]) +; POSTPROCESS-NEXT: [[HANDLE29:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[HANDLE110]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; POSTPROCESS-NEXT: [[TMP17:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP16]]) ; POSTPROCESS-NEXT: [[I8:%.*]] = extractelement <3 x i32> [[TMP17]], i8 0 -; POSTPROCESS-NEXT: [[UNPACKED7:%.*]] = call [[DX_TYPES_FOURI32:%.*]] 
[[DX_OP_UNPACK4X8_I32:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 219, i8 1, i32 [[I8]]) +; POSTPROCESS-NEXT: [[UNPACKED7:%.*]] = call [[DX_TYPES_FOURI32:%.*]] @[[DX_OP_UNPACK4X8_I32]](i32 219, i8 1, i32 [[I8]]) ; POSTPROCESS-NEXT: [[TMP7:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP16]]) ; POSTPROCESS-NEXT: [[I6:%.*]] = extractelement <3 x i32> [[TMP7]], i8 0 -; POSTPROCESS-NEXT: [[UNPACKED5:%.*]] = call [[DX_TYPES_FOURI32]] [[DX_OP_UNPACK4X8_I32]](i32 219, i8 1, i32 [[I6]]) +; POSTPROCESS-NEXT: [[UNPACKED5:%.*]] = call [[DX_TYPES_FOURI32]] @[[DX_OP_UNPACK4X8_I32]](i32 219, i8 1, i32 [[I6]]) ; POSTPROCESS-NEXT: [[TMP8:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP16]]) ; POSTPROCESS-NEXT: [[I4:%.*]] = extractelement <3 x i32> [[TMP8]], i8 0 -; POSTPROCESS-NEXT: [[UNPACKED3:%.*]] = call [[DX_TYPES_FOURI32]] [[DX_OP_UNPACK4X8_I32]](i32 219, i8 1, i32 [[I4]]) +; POSTPROCESS-NEXT: [[UNPACKED3:%.*]] = call [[DX_TYPES_FOURI32]] @[[DX_OP_UNPACK4X8_I32]](i32 219, i8 1, i32 [[I4]]) ; POSTPROCESS-NEXT: [[TMP9:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP16]]) ; POSTPROCESS-NEXT: [[I2:%.*]] = extractelement <3 x i32> [[TMP9]], i8 0 -; POSTPROCESS-NEXT: [[UNPACKED1:%.*]] = call [[DX_TYPES_FOURI32]] [[DX_OP_UNPACK4X8_I32]](i32 219, i8 1, i32 [[I2]]) +; POSTPROCESS-NEXT: [[UNPACKED1:%.*]] = call [[DX_TYPES_FOURI32]] @[[DX_OP_UNPACK4X8_I32]](i32 219, i8 1, i32 [[I2]]) ; POSTPROCESS-NEXT: [[A:%.*]] = extractvalue [[DX_TYPES_FOURI32]] [[UNPACKED7]], 0 ; POSTPROCESS-NEXT: [[B:%.*]] = extractvalue [[DX_TYPES_FOURI32]] [[UNPACKED5]], 1 ; POSTPROCESS-NEXT: [[C:%.*]] = extractvalue [[DX_TYPES_FOURI32]] [[UNPACKED3]], 2 diff --git a/llvmraytracing/test/dx/stats-report-sizes.ll b/llvmraytracing/test/dx/stats-report-sizes.ll index ae343aba9a..4f36c59d7a 100644 --- a/llvmraytracing/test/dx/stats-report-sizes.ll +++ b/llvmraytracing/test/dx/stats-report-sizes.ll @@ -1,5 +1,5 @@ -; RUN: opt --report-cont-state-sizes --verify-each 
-passes='continuations-stats-report,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s --check-prefix=REPORT-CONT-SIZES -; RUN: opt --report-payload-register-sizes --verify-each -passes='continuations-stats-report,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s --check-prefix=REPORT-PAYLOAD-SIZES +; RUN: opt --report-cont-state-sizes --verify-each -passes='continuations-stats-report,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s --check-prefix=REPORT-CONT-SIZES +; RUN: opt --report-payload-register-sizes=max --verify-each -passes='continuations-stats-report,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s --check-prefix=REPORT-PAYLOAD-SIZES ; RUN: opt --report-system-data-sizes --verify-each -passes='continuations-stats-report,dxil-cont-post-process,lint,continuations-lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s --check-prefix=REPORT-SYSTEM-DATA-SIZES target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" @@ -10,15 +10,15 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: declare i32 @continuation.initialContinuationStackPtr() declare i32 @_cont_GetContinuationStackAddr() declare i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) -declare void @lgc.ilcps.continue(i64, ...) +declare void @lgc.cps.jump(...) 
; REPORT-CONT-SIZES: Continuation state size of "RayGen" (raygeneration): 108 bytes -; REPORT-PAYLOAD-SIZES: Incoming and max outgoing payload VGPR size of "RayGen" (raygeneration): 28 and 24 bytes +; REPORT-PAYLOAD-SIZES: Incoming and max outgoing payload VGPR size of "RayGen" (raygeneration): 7 and 6 dwords define void @RayGen(i64 %dummyRetAddr, %struct.DispatchSystemData %0) !continuation.entry !0 !continuation !3 !continuation.state !5 !continuation.registercount !7 !lgc.rt.shaderstage !12 { %csp = alloca i32, align 4 %cspInit = call i32 @continuation.initialContinuationStackPtr() store i32 %cspInit, i32* %csp - call void (i64, ...) @lgc.ilcps.continue(i64 2, i32 poison, i64 poison), !continuation.registercount !6 + call void (...) @lgc.cps.jump(i64 2, i32 poison, {} poison, i64 poison), !continuation.registercount !6 ret void } @@ -29,10 +29,10 @@ define void @RayGen.resume.0(i64 %0, { %struct.DispatchSystemData } %1) !continu ret void } -; REPORT-PAYLOAD-SIZES: Incoming and max outgoing payload VGPR size of "CHS" (closesthit): 32 and 36 bytes +; REPORT-PAYLOAD-SIZES: Incoming and max outgoing payload VGPR size of "CHS" (closesthit): 8 and 9 dwords ; REPORT-SYSTEM-DATA-SIZES-DAG: Incoming system data of "CHS" (closesthit) is "struct.CHSSystemData", size: 400 bytes define void @CHS(i64 %returnAddr, %struct.CHSSystemData %0) !continuation !14 !continuation.registercount !8 !lgc.rt.shaderstage !13 { - call void (i64, ...) @lgc.ilcps.continue(i64 2, i32 poison, i64 poison), !continuation.registercount !9 + call void ( ...) 
@lgc.cps.jump(i64 2, i32 poison, {} poison, i64 poison), !continuation.registercount !9 ret void } diff --git a/llvmraytracing/test/dx/traceray.ll b/llvmraytracing/test/dx/traceray.ll index 41ed6ef32e..eac4b2f7d3 100644 --- a/llvmraytracing/test/dx/traceray.ll +++ b/llvmraytracing/test/dx/traceray.ll @@ -36,7 +36,7 @@ declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemD declare %struct.TraversalData @_AmdAwaitAnyHit(i64, %struct.TraversalData, float, i32) #0 -declare void @lgc.ilcps.continue(...) #0 +declare void @lgc.cps.jump(...) #0 declare void @_AmdContStackSetPtr(i32) #0 @@ -111,7 +111,7 @@ define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #0 !poi define void @_cont_KernelEntry() #0 !lgc.rt.shaderstage !69 { %cspInit = ptrtoint ptr @debug_global to i32 call void @_AmdContStackSetPtr(i32 %cspInit) - call void (...) @lgc.ilcps.continue(i64 0, i32 poison, i64 undef, %struct.DispatchSystemData poison) + call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 undef, %struct.DispatchSystemData poison) ret void } @@ -421,7 +421,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation.registercount [[META22:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_AmdContStackSetPtr(i32 [[CSPINIT]]) -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.continue(i64 0, i32 poison, i64 undef, [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison), !continuation.registercount [[META22]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 0, i32 -1, {} poison, i64 undef, [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison), !continuation.registercount [[META22]] ; LOWERRAYTRACINGPIPELINE-NEXT: ret void ; ; @@ -452,8 +452,8 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP5]]) #[[ATTR1:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA37:![0-9]+]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], 0 @@ -476,7 +476,7 @@ attributes #6 = { nocallback nofree nosync 
nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP18]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 -1, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[TMP41]]), !continuation.registercount [[META33:![0-9]+]], !continuation.wait.await [[META13]], !continuation.returnedRegistercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[TMP41]]), !continuation.registercount [[META33:![0-9]+]], !waitmask [[META40:![0-9]+]], !continuation.returnedRegistercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } @await(ptr [[TMP42]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP43]], 2 ; LOWERRAYTRACINGPIPELINE-NEXT: store [10 x i32] [[TMP24]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 @@ -506,19 +506,20 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP29]], i8 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() ; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP30]], i8 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP40]], 
[[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP40]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[TMP28]], i64 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP28]], i64 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP28]], i64 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = extractelement <4 x float> [[TMP28]], i64 3 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP32]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP33]], float [[TMP34]], float [[TMP35]], float [[TMP36]], i8 15) ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP5]]) #[[ATTR1]] -; LOWERRAYTRACINGPIPELINE-NEXT: ret void +; LOWERRAYTRACINGPIPELINE-NEXT: call void @lgc.cps.complete() +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @MyClosestHitShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META41:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META42:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca 
[[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 @@ -582,12 +583,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP44]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP45]], [21 x i32] poison, [10 x i32] [[TMP47]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP45]], [21 x i32] poison, [10 x i32] [[TMP47]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.TraversalData @MyAnyHitShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META43:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META44:![0-9]+]] { ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -684,7 +685,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP58]], ptr [[ADDR_I1]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_TRAVERSALDATA]] [[TMP60]], [8 x i32] poison, [10 x i32] [[TMP68]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP60]], [8 x i32] poison, [10 x i32] [[TMP68]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 59: ; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP25]], ptr [[TMP24]], align 4 @@ -716,12 +717,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP79]], ptr [[ADDR_I2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP81:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_TRAVERSALDATA]] [[TMP81]], [8 x i32] poison, [10 x i32] [[TMP78]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP81]], [8 x i32] poison, [10 x i32] [[TMP78]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.TraversalData @MyIntersectionShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META44:![0-9]+]] !continuation.registercount [[META32:![0-9]+]] !continuation [[META45:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META45:![0-9]+]] !continuation.registercount [[META32:![0-9]+]] !continuation [[META46:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 4 @@ -772,18 +773,18 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: 22: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_TRAVERSALDATA]] [[TMP21]], [8 x i32] poison, [30 x i32] [[TMP24]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP21]], [8 x i32] poison, [30 x i32] [[TMP24]]), !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 25: ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_TRAVERSALDATA]] [[TMP23]], [8 x i32] poison, [30 x i32] [[TMP27]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP23]], [8 x i32] poison, [30 x i32] [[TMP27]]), !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.TraversalData @MyIntersectionShaderLargeAttrs( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META44]] !continuation.registercount [[META32]] !continuation [[META46:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META45]] !continuation.registercount [[META32]] !continuation [[META47:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_LARGEINTERSECTIONATTRIBUTES:%.*]], 
align 4 @@ -867,18 +868,18 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: 36: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_TRAVERSALDATA]] [[TMP31]], [8 x i32] poison, [30 x i32] [[TMP38]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP31]], [8 x i32] poison, [30 x i32] [[TMP38]]), !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 39: ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_TRAVERSALDATA]] [[TMP33]], [8 x i32] poison, [30 x i32] [[TMP41]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TMP33]], [8 x i32] poison, [30 x i32] [[TMP41]]), !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @MyMissShader( -; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META47:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META48:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META48:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META49:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 @@ -921,37 +922,37 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP27]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP28]], [21 x i32] poison, [10 x i32] [[TMP29]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP28]], [21 x i32] poison, [10 x i32] [[TMP29]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-LABEL: define i1 @_cont_IsEndSearch( -; DXILCONTPOSTPROCESS-SAME: ptr [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: ptr [[DATA:%.*]]) #[[ATTR1:[0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: [[ISEND:%.*]] = call i1 @opaqueIsEnd() ; DXILCONTPOSTPROCESS-NEXT: ret i1 [[ISEND]] ; ; ; DXILCONTPOSTPROCESS-LABEL: define %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes( -; DXILCONTPOSTPROCESS-SAME: ptr [[DATA:%.*]]) #[[ATTR0]] { +; DXILCONTPOSTPROCESS-SAME: ptr [[DATA:%.*]]) #[[ATTR1]] { ; DXILCONTPOSTPROCESS-NEXT: [[ADDR:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[DATA]], i32 0, i32 1 ; DXILCONTPOSTPROCESS-NEXT: [[VAL:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], ptr [[ADDR]], align 4 ; DXILCONTPOSTPROCESS-NEXT: ret [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL]] ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @_cont_SetTriangleHitAttributes( -; DXILCONTPOSTPROCESS-SAME: ptr [[DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[VAL:%.*]]) #[[ATTR0]] { +; DXILCONTPOSTPROCESS-SAME: ptr [[DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[VAL:%.*]]) #[[ATTR1]] { ; DXILCONTPOSTPROCESS-NEXT: [[ADDR:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[DATA]], i32 0, i32 1 ; DXILCONTPOSTPROCESS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL]], ptr [[ADDR]], align 4 ; DXILCONTPOSTPROCESS-NEXT: ret void ; ; ; DXILCONTPOSTPROCESS-LABEL: define i32 @_cont_GetLocalRootIndex( -; DXILCONTPOSTPROCESS-SAME: ptr [[DATA:%.*]]) #[[ATTR0]] { +; DXILCONTPOSTPROCESS-SAME: ptr [[DATA:%.*]]) #[[ATTR1]] { ; DXILCONTPOSTPROCESS-NEXT: ret i32 5 ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @_cont_KernelEntry( -; 
DXILCONTPOSTPROCESS-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -961,7 +962,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define %struct.HitData @_cont_GetCandidateState( -; DXILCONTPOSTPROCESS-SAME: ptr [[DATA:%.*]]) #[[ATTR0]] { +; DXILCONTPOSTPROCESS-SAME: ptr [[DATA:%.*]]) #[[ATTR1]] { ; DXILCONTPOSTPROCESS-NEXT: [[RESPTR:%.*]] = getelementptr [[STRUCT_TRAVERSALDATA:%.*]], ptr [[DATA]], i32 0, i32 1 ; DXILCONTPOSTPROCESS-NEXT: [[RES:%.*]] = load [[STRUCT_HITDATA:%.*]], ptr [[RESPTR]], align 4 ; DXILCONTPOSTPROCESS-NEXT: ret [[STRUCT_HITDATA]] [[RES]] @@ -983,8 +984,8 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; DXILCONTPOSTPROCESS-NEXT: [[TMP1:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP3:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP1]]) -; DXILCONTPOSTPROCESS-NEXT: [[TMP4:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP3]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; DXILCONTPOSTPROCESS-NEXT: [[TMP3:%.*]] = call [[DX_TYPES_HANDLE]] 
@[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP1]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP4:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP3]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; DXILCONTPOSTPROCESS-NEXT: [[TMP5:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP4]]) ; DXILCONTPOSTPROCESS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 @@ -1049,8 +1050,8 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP10]], i8 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP11:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA1]]) ; DXILCONTPOSTPROCESS-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP11]], i8 1 -; DXILCONTPOSTPROCESS-NEXT: [[TMP21:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP9]]) -; DXILCONTPOSTPROCESS-NEXT: [[TMP13:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP21]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) +; DXILCONTPOSTPROCESS-NEXT: [[TMP21:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP9]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP13:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP21]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 
1033 }) ; DXILCONTPOSTPROCESS-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 1 ; DXILCONTPOSTPROCESS-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 2 @@ -2122,32 +2123,32 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define i1 @_cont_IsEndSearch( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: ptr [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: ptr [[DATA:%.*]]) #[[ATTR1:[0-9]+]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[ISEND:%.*]] = call i1 @opaqueIsEnd() ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: ret i1 [[ISEND]] ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: ptr [[DATA:%.*]]) #[[ATTR0]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: ptr [[DATA:%.*]]) #[[ATTR1]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[ADDR:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[DATA]], i32 0, i32 1 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[VAL:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], ptr [[ADDR]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: ret [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL]] ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @_cont_SetTriangleHitAttributes( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: ptr [[DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[VAL:%.*]]) #[[ATTR0]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: ptr [[DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[VAL:%.*]]) #[[ATTR1]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[ADDR:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[DATA]], i32 0, i32 1 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL]], ptr [[ADDR]], align 4 ; 
DXILCONTPOSTPROCESS-GLOBAL-NEXT: ret void ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define i32 @_cont_GetLocalRootIndex( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: ptr [[DATA:%.*]]) #[[ATTR0]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: ptr [[DATA:%.*]]) #[[ATTR1]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: ret i32 5 ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @_cont_KernelEntry( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP1:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(22) @@ -2159,7 +2160,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define %struct.HitData @_cont_GetCandidateState( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: ptr [[DATA:%.*]]) #[[ATTR0]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: ptr [[DATA:%.*]]) #[[ATTR1]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[RESPTR:%.*]] = getelementptr [[STRUCT_TRAVERSALDATA:%.*]], ptr [[DATA]], i32 0, i32 1 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[RES:%.*]] = load [[STRUCT_HITDATA:%.*]], ptr [[RESPTR]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: ret [[STRUCT_HITDATA]] [[RES]] @@ -2183,8 +2184,8 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP3:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP4:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; 
DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP5:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP6:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP5]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP5:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP6:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP5]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP7:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP6]]) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT20]], 0 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 @@ -2251,8 +2252,8 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP12]], i8 0 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP13:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA1]]) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP13]], i8 1 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP23:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] 
[[TMP11]]) -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP15:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP23]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP23:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP11]]) +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP15:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP23]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 0 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 1 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 2 @@ -3357,10 +3358,10 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @_cont_KernelEntry( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation.registercount [[META22:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META35:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_AmdContStackSetPtr(i32 [[CSPINIT]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 0, i32 poison, i64 undef, [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison), !continuation.registercount [[META22]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i64 0, i32 -1, {} poison, i64 undef, [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: ret void ; ; @@ -3379,7 +3380,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyRayGen( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 @@ -3391,8 +3392,8 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP5]]) #[[ATTR1:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA38:![0-9]+]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] 
[[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], 0 @@ -3415,7 +3416,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP20]], ptr [[TMP18]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa21i32a10i32s(i32 4, i32 8, i64 -1, i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [6 x i32] poison, [10 x i32] [[TMP21]]), !continuation.registercount [[META33:![0-9]+]], !continuation.wait.await [[META13:![0-9]+]], !continuation.returnedRegistercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa21i32a10i32s(i32 4, i32 8, i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[TMP21]]), !waitmask [[META41:![0-9]+]], !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP22]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[TMP23]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4 @@ -3444,19 +3445,20 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP37]], i8 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP38]], i8 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP39]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP39]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = extractelement <4 x float> [[TMP36]], i64 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = extractelement <4 x float> [[TMP36]], i64 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = 
extractelement <4 x float> [[TMP36]], i64 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = extractelement <4 x float> [[TMP36]], i64 3 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP40]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP41]], float [[TMP42]], float [[TMP43]], float [[TMP44]], i8 15) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP5]]) #[[ATTR1]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: ret void +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @lgc.cps.complete() +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyClosestHitShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41:![0-9]+]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42:![0-9]+]] !lgc.cps [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 @@ -3524,7 +3526,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyAnyHitShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 
[[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42]] !lgc.cps [[META44:![0-9]+]] !continuation [[META45:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43]] !lgc.cps [[META45:![0-9]+]] !continuation [[META46:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -3657,7 +3659,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyIntersectionShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META46:![0-9]+]] !continuation [[META47:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META47:![0-9]+]] !continuation [[META48:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca 
[[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 4 @@ -3680,7 +3682,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.TraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, float [[RES_I1]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP7]], [20 x i32] poison, [30 x i32] [[TMP8]]), !continuation.registercount [[META32:![0-9]+]], !continuation.returnedRegistercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } (...) 
@lgc.cps.await__sl_s_struct.TraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, float [[RES_I1]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP7]], [20 x i32] poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META32:![0-9]+]], !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [30 x i32] [[TMP10]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 0 @@ -3718,7 +3720,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyIntersectionShaderLargeAttrs( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META46]] !continuation [[META48:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META47]] !continuation [[META49:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_LARGEINTERSECTIONATTRIBUTES:%.*]], align 4 @@ -3755,7 +3757,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = 
getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load [[STRUCT_LARGEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.TraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, float [[RES_I1]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[TMP7]], [15 x i32] poison, [30 x i32] [[TMP8]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.TraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, float [[RES_I1]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[TMP7]], [15 x i32] poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META32]], !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [30 x i32] [[TMP10]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 0 @@ -3812,7 +3814,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyMissShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META44]] !lgc.cps [[META42]] !continuation [[META49:![0-9]+]] { +; 
LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META45]] !lgc.cps [[META43]] !continuation [[META50:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [10 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 @@ -3884,10 +3886,10 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define void @_cont_KernelEntry( -; CLEANUP-CPS-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation.registercount [[META22:![0-9]+]] { +; CLEANUP-CPS-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META35:![0-9]+]] { ; CLEANUP-CPS-NEXT: [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32 ; CLEANUP-CPS-NEXT: call void @_AmdContStackSetPtr(i32 [[CSPINIT]]) -; CLEANUP-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 0, i32 poison, i64 undef, [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison), !continuation.registercount [[META22]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i64 0, i32 -1, {} poison, i64 undef, [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison) ; CLEANUP-CPS-NEXT: ret void ; ; @@ -3906,14 +3908,14 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define void @MyRayGen( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT20:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; CLEANUP-CPS-NEXT: [[TMP1:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; CLEANUP-CPS-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; CLEANUP-CPS-NEXT: [[TMP3:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP1]]) -; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP3]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; CLEANUP-CPS-NEXT: [[TMP3:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP1]]) +; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call [[DX_TYPES_HANDLE]] 
@[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP3]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP4]]) ; CLEANUP-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT20]], 0 ; CLEANUP-CPS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 @@ -3938,7 +3940,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP8]], 7 ; CLEANUP-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP9]], 8 ; CLEANUP-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP10]], 9 -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i64 -1, i64 [[TMP6]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [6 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33:![0-9]+]], !continuation.wait.await [[META13:![0-9]+]], !continuation.returnedRegistercount [[META33]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 4, i32 5, {} poison, i64 [[TMP6]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !waitmask [[META38:![0-9]+]], !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] ; CLEANUP-CPS-NEXT: unreachable ; ; @@ -3976,8 +3978,8 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] }, ptr [[TMP4]], i32 0, i32 0 ; CLEANUP-CPS-NEXT: [[TMP15:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP14]]) ; CLEANUP-CPS-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP15]], i8 1 -; CLEANUP-CPS-NEXT: [[TMP16:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP11]]) -; CLEANUP-CPS-NEXT: [[TMP17:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP16]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) +; CLEANUP-CPS-NEXT: [[TMP16:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP11]]) +; CLEANUP-CPS-NEXT: [[TMP17:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP16]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) ; CLEANUP-CPS-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 0 ; CLEANUP-CPS-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 1 ; CLEANUP-CPS-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 2 @@ -3987,7 +3989,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define void @MyClosestHitShader( -; CLEANUP-CPS-SAME: {} 
[[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META38:![0-9]+]] !lgc.cps [[META39:![0-9]+]] !continuation [[META40:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META39:![0-9]+]] !lgc.cps [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 ; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 @@ -4052,7 +4054,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define void @MyAnyHitShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META39]] !lgc.cps [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[TMP0:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; CLEANUP-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -4281,7 +4283,7 @@ attributes #6 = { nocallback 
nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define void @MyIntersectionShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META44:![0-9]+]] !continuation [[META45:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANUP-CPS-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -4372,7 +4374,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT89:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT86]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @MyIntersectionShader.resume.0) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP0]], i32 5, float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT327]], [20 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.registercount [[META32:![0-9]+]], !continuation.returnedRegistercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP0]], i32 5, float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT327]], [20 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META32:![0-9]+]], !continuation.registercount [[META32]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: accepthit.i: ; CLEANUP-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -4474,7 +4476,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META43]] !continuation [[META44]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META44]] !continuation [[META45]] { ; CLEANUP-CPS-NEXT: entryresume.0: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) ; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 2 @@ -4611,7 +4613,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define void @MyIntersectionShaderLargeAttrs( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META43]] !continuation [[META45:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], 
[8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META44]] !continuation [[META46:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANUP-CPS-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADERLARGEATTRS_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -4708,7 +4710,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CLEANUP-CPS-NEXT: [[DOTFCA_28_INSERT89:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT86]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; CLEANUP-CPS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; CLEANUP-CPS-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @MyIntersectionShaderLargeAttrs.resume.0) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP0]], i32 5, float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [15 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount [[META32]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP0]], i32 5, float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [15 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META32]], !continuation.registercount [[META32]] ; CLEANUP-CPS-NEXT: unreachable ; CLEANUP-CPS: accepthit.i: ; CLEANUP-CPS-NEXT: [[TMP1:%.*]] = bitcast i32 100 to float @@ -4806,7 +4808,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define dso_local void @MyIntersectionShaderLargeAttrs.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META43]] !continuation [[META45]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META44]] !continuation [[META46]] { ; CLEANUP-CPS-NEXT: entryresume.0: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) ; CLEANUP-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 2 @@ -4943,7 +4945,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; CLEANUP-CPS-LABEL: define void @MyMissShader( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41]] !lgc.cps [[META39]] !continuation [[META46:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] 
!lgc.rt.shaderstage [[META42]] !lgc.cps [[META40]] !continuation [[META47:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 ; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 @@ -4990,32 +4992,32 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define i1 @_cont_IsEndSearch( -; DXILCONTPOSTPROCESS-CPS-SAME: ptr [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: ptr [[DATA:%.*]]) #[[ATTR1:[0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: [[ISEND:%.*]] = call i1 @opaqueIsEnd() ; DXILCONTPOSTPROCESS-CPS-NEXT: ret i1 [[ISEND]] ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes( -; DXILCONTPOSTPROCESS-CPS-SAME: ptr [[DATA:%.*]]) #[[ATTR0]] { +; DXILCONTPOSTPROCESS-CPS-SAME: ptr [[DATA:%.*]]) #[[ATTR1]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: [[ADDR:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[DATA]], i32 0, i32 1 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[VAL:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], ptr [[ADDR]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: ret [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL]] ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @_cont_SetTriangleHitAttributes( -; DXILCONTPOSTPROCESS-CPS-SAME: ptr [[DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[VAL:%.*]]) #[[ATTR0]] { +; DXILCONTPOSTPROCESS-CPS-SAME: ptr [[DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[VAL:%.*]]) #[[ATTR1]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: [[ADDR:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[DATA]], i32 0, i32 1 ; DXILCONTPOSTPROCESS-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL]], ptr [[ADDR]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: ret void ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define i32 
@_cont_GetLocalRootIndex( -; DXILCONTPOSTPROCESS-CPS-SAME: ptr [[DATA:%.*]]) #[[ATTR0]] { +; DXILCONTPOSTPROCESS-CPS-SAME: ptr [[DATA:%.*]]) #[[ATTR1]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: ret i32 5 ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @_cont_KernelEntry( -; DXILCONTPOSTPROCESS-CPS-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META35:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: ) #[[ATTR1]] !lgc.rt.shaderstage [[META35:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5025,7 +5027,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define %struct.HitData @_cont_GetCandidateState( -; DXILCONTPOSTPROCESS-CPS-SAME: ptr [[DATA:%.*]]) #[[ATTR0]] { +; DXILCONTPOSTPROCESS-CPS-SAME: ptr [[DATA:%.*]]) #[[ATTR1]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: [[RESPTR:%.*]] = getelementptr [[STRUCT_TRAVERSALDATA:%.*]], ptr [[DATA]], i32 0, i32 1 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[RES:%.*]] = load [[STRUCT_HITDATA:%.*]], ptr [[RESPTR]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: ret [[STRUCT_HITDATA]] [[RES]] @@ -5047,8 +5049,8 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP3:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP4]]) -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = call 
[[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP3]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP3:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP4]]) +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP3]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP8:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP7]]) ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT20]], 0 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 @@ -5074,7 +5076,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP12]], 8 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP13]], 9 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: call void (...) @lgc.ilcps.continue(i64 4, i32 [[TMP14]], i64 -1, i64 [[TMP9]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [6 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) +; DXILCONTPOSTPROCESS-CPS-NEXT: call void (...) 
@lgc.ilcps.waitContinue(i64 4, i64 -1, i32 [[TMP14]], i64 [[TMP9]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]) ; DXILCONTPOSTPROCESS-CPS-NEXT: unreachable ; ; @@ -5114,8 +5116,8 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP15:%.*]] = getelementptr inbounds { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] }, ptr [[TMP4]], i32 0, i32 0 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP16:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[TMP15]]) ; DXILCONTPOSTPROCESS-CPS-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP16]], i8 1 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP17:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP12]]) -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP18:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP17]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP17:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP12]]) +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP18:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP17]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 0 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 1 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 2 diff --git a/llvmraytracing/test/dx/traversal-empty-payload.ll b/llvmraytracing/test/dx/traversal-empty-payload.ll index 2e91b8b695..2e3a304308 100644 --- 
a/llvmraytracing/test/dx/traversal-empty-payload.ll +++ b/llvmraytracing/test/dx/traversal-empty-payload.ll @@ -16,9 +16,7 @@ declare !pointeetys !4 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) declare !pointeetys !6 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind) -declare void @lgc.ilcps.continue(...) - -declare void @lgc.ilcps.waitContinue(...) +declare void @lgc.cps.jump(...) declare i64 @lgc.cps.as.continuation.reference__i64(...) #3 @@ -35,12 +33,12 @@ define void @_cont_Traversal(%struct.TraversalData %data) #1 !lgc.rt.shaderstage 6: ; preds = %0 %7 = load %struct.SystemData, ptr %5, align 4 %8 = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @_cont_Traversal) - call void (...) @lgc.ilcps.waitContinue(i64 1, i64 -1, i32 0, i64 %8, %struct.SystemData %7) + call void (...) @lgc.cps.jump(i64 1, i32 -1, {} poison, i64 %8, %struct.SystemData %7), !waitmask !9 unreachable 9: ; preds = %0 %10 = load %struct.SystemData, ptr %5, align 4 - call void (...) @lgc.ilcps.waitContinue(i64 0, i64 -1, i32 2, i64 poison, %struct.SystemData %10) + call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 poison, %struct.SystemData %10), !waitmask !9 unreachable } @@ -57,6 +55,7 @@ attributes #2 = { nounwind } !6 = !{%struct.TraversalData poison} !7 = !{i32 6} !8 = !{i32 0} +!9 = !{i32 -1} ; EMPTYPAYLOAD-LABEL: define %struct.TraversalData @_cont_Traversal( ; EMPTYPAYLOAD-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [0 x i32] [[PADDING:%.*]], [0 x i32] [[PAYLOAD:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META3:![0-9]+]] !continuation.registercount [[META0:![0-9]+]] !continuation [[META4:![0-9]+]] { ; EMPTYPAYLOAD-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_TRAVERSALDATA]], align 8 @@ -72,13 +71,11 @@ attributes #2 = { nounwind } ; EMPTYPAYLOAD: 7: ; EMPTYPAYLOAD-NEXT: [[TMP8:%.*]] = load [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP6]], align 4 ; EMPTYPAYLOAD-NEXT: [[TMP9:%.*]] = call i64 (...) 
@lgc.cps.as.continuation.reference__i64(ptr @_cont_Traversal) -; EMPTYPAYLOAD-NEXT: [[TMP10:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; EMPTYPAYLOAD-NEXT: call void (...) @lgc.cps.jump(i64 1, i32 -1, {} poison, i64 [[TMP9]], [[STRUCT_SYSTEMDATA]] [[TMP8]]), !continuation.registercount [[META0]], !waitmask [[META5:![0-9]+]] +; EMPTYPAYLOAD-NEXT: call void (...) @lgc.cps.jump(i64 1, i32 -1, {} poison, i64 [[TMP9]], [[STRUCT_SYSTEMDATA]] [[TMP8]]), !waitmask [[META5:![0-9]+]], !continuation.registercount [[META0]] ; EMPTYPAYLOAD-NEXT: unreachable -; EMPTYPAYLOAD: 11: +; EMPTYPAYLOAD: 10: ; EMPTYPAYLOAD-NEXT: [[TMP13:%.*]] = load [[STRUCT_SYSTEMDATA]], ptr [[TMP6]], align 4 -; EMPTYPAYLOAD-NEXT: [[TMP14:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; EMPTYPAYLOAD-NEXT: call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 poison, [[STRUCT_SYSTEMDATA]] [[TMP13]]), !continuation.registercount [[META0]], !waitmask [[META5]] +; EMPTYPAYLOAD-NEXT: call void (...) 
@lgc.cps.jump(i64 0, i32 -1, {} poison, i64 poison, [[STRUCT_SYSTEMDATA]] [[TMP13]]), !waitmask [[META5]], !continuation.registercount [[META0]] ; EMPTYPAYLOAD-NEXT: unreachable ; ; @@ -99,18 +96,12 @@ attributes #2 = { nounwind } ; EMPTYPAYLOAD-ALL-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] poison, i32 [[DOTFCA_0_0_0_EXTRACT15]], 0, 0 ; EMPTYPAYLOAD-ALL-NEXT: [[DOTFCA_1_INSERT19:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] [[DOTFCA_0_0_INSERT]], float [[DOTFCA_0_1_EXTRACT16]], 1 ; EMPTYPAYLOAD-ALL-NEXT: [[TMP3:%.*]] = call i64 @continuation.getAddrAndMD(ptr @_cont_Traversal) -; EMPTYPAYLOAD-ALL-NEXT: [[DOTFCA_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, i32 [[DOTFCA_0_0_0_EXTRACT]], 0, 0, 0 -; EMPTYPAYLOAD-ALL-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_0_0_0_INSERT]], float [[DOTFCA_0_1_EXTRACT]], 0, 1 -; EMPTYPAYLOAD-ALL-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_0_1_INSERT]], i32 [[DOTFCA_1_EXTRACT]], 1 ; EMPTYPAYLOAD-ALL-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 ; EMPTYPAYLOAD-ALL-NEXT: call void (...) 
@lgc.ilcps.waitContinue(i64 1, i64 -1, i32 [[TMP6]], i64 [[TMP3]], [[STRUCT_SYSTEMDATA]] [[DOTFCA_1_INSERT19]]) ; EMPTYPAYLOAD-ALL-NEXT: unreachable ; EMPTYPAYLOAD-ALL: 5: ; EMPTYPAYLOAD-ALL-NEXT: [[DOTFCA_0_0_INSERT22:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] poison, i32 [[DOTFCA_0_0_0_EXTRACT15]], 0, 0 ; EMPTYPAYLOAD-ALL-NEXT: [[DOTFCA_1_INSERT25:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] [[DOTFCA_0_0_INSERT22]], float [[DOTFCA_0_1_EXTRACT16]], 1 -; EMPTYPAYLOAD-ALL-NEXT: [[DOTFCA_0_0_0_INSERT6:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, i32 [[DOTFCA_0_0_0_EXTRACT]], 0, 0, 0 -; EMPTYPAYLOAD-ALL-NEXT: [[DOTFCA_0_1_INSERT9:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_0_0_0_INSERT6]], float [[DOTFCA_0_1_EXTRACT]], 0, 1 -; EMPTYPAYLOAD-ALL-NEXT: [[DOTFCA_1_INSERT12:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_0_1_INSERT9]], i32 [[DOTFCA_1_EXTRACT]], 1 ; EMPTYPAYLOAD-ALL-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 ; EMPTYPAYLOAD-ALL-NEXT: call void (...) @lgc.ilcps.waitContinue(i64 0, i64 -1, i32 [[TMP10]], i64 poison, [[STRUCT_SYSTEMDATA]] [[DOTFCA_1_INSERT25]]) ; EMPTYPAYLOAD-ALL-NEXT: unreachable diff --git a/llvmraytracing/test/dx/traversal-passthrough-payload.ll b/llvmraytracing/test/dx/traversal-passthrough-payload.ll index 1d11e94765..6d75c1ba92 100644 --- a/llvmraytracing/test/dx/traversal-passthrough-payload.ll +++ b/llvmraytracing/test/dx/traversal-passthrough-payload.ll @@ -16,9 +16,7 @@ declare !pointeetys !4 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) declare !pointeetys !6 i1 @_cont_ReportHit(%struct.TraversalData* %data, float %t, i32 %hitKind) -declare void @lgc.ilcps.continue(...) - -declare void @lgc.ilcps.waitContinue(...) +declare void @lgc.cps.jump(...) declare i64 @lgc.cps.as.continuation.reference__i64(...) #3 @@ -35,12 +33,12 @@ define void @_cont_Traversal(%struct.TraversalData %data) #1 !lgc.rt.shaderstage 6: ; preds = %0 %7 = load %struct.SystemData, ptr %5, align 4 %8 = call i64 (...) 
@lgc.cps.as.continuation.reference__i64(ptr @_cont_Traversal) - call void (...) @lgc.ilcps.waitContinue(i64 1, i64 -1, i32 0, i64 %8, %struct.SystemData %7) + call void (...) @lgc.cps.jump(i64 1, i32 -1, {} poison, i64 %8, %struct.SystemData %7), !waitmask !9 unreachable 9: ; preds = %0 %10 = load %struct.SystemData, ptr %5, align 4 - call void (...) @lgc.ilcps.waitContinue(i64 0, i64 -1, i32 2, i64 poison, %struct.SystemData %10) + call void (...) @lgc.cps.jump(i64 0, i32 -1, {} poison, i64 poison, %struct.SystemData %10), !waitmask !9 unreachable } @@ -57,6 +55,7 @@ attributes #2 = { nounwind } !6 = !{%struct.TraversalData poison} !7 = !{i32 6} !8 = !{i32 4} ; PRESERVED_REGCOUNT +!9 = !{i32 -1} ; MAXPAYLOADSIZE-LABEL: define void @_cont_Traversal( ; MAXPAYLOADSIZE-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META2:![0-9]+]] !continuation.registercount [[META0:![0-9]+]] !continuation [[META3:![0-9]+]] !continuation.state [[META4:![0-9]+]] { ; MAXPAYLOADSIZE-NEXT: AllocaSpillBB: @@ -104,9 +103,6 @@ attributes #2 = { nounwind } ; MAXPAYLOADSIZE-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] poison, i32 [[DOTFCA_0_0_0_EXTRACT136]], 0, 0 ; MAXPAYLOADSIZE-NEXT: [[DOTFCA_1_INSERT140:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] [[DOTFCA_0_0_INSERT]], float [[DOTFCA_0_1_EXTRACT137]], 1 ; MAXPAYLOADSIZE-NEXT: [[TMP3:%.*]] = call i64 @continuation.getAddrAndMD(ptr @_cont_Traversal) -; MAXPAYLOADSIZE-NEXT: [[DOTFCA_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, i32 [[DOTFCA_0_0_0_EXTRACT]], 0, 0, 0 -; MAXPAYLOADSIZE-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_0_0_0_INSERT]], float [[DOTFCA_0_1_EXTRACT]], 0, 1 -; MAXPAYLOADSIZE-NEXT: [[DOTFCA_1_INSERT124:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_0_1_INSERT]], i32 [[DOTFCA_1_EXTRACT]], 1 ; 
MAXPAYLOADSIZE-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [30 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; MAXPAYLOADSIZE-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 ; MAXPAYLOADSIZE-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 @@ -143,9 +139,6 @@ attributes #2 = { nounwind } ; MAXPAYLOADSIZE: 5: ; MAXPAYLOADSIZE-NEXT: [[DOTFCA_0_0_INSERT143:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] poison, i32 [[DOTFCA_0_0_0_EXTRACT136]], 0, 0 ; MAXPAYLOADSIZE-NEXT: [[DOTFCA_1_INSERT146:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] [[DOTFCA_0_0_INSERT143]], float [[DOTFCA_0_1_EXTRACT137]], 1 -; MAXPAYLOADSIZE-NEXT: [[DOTFCA_0_0_0_INSERT127:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, i32 [[DOTFCA_0_0_0_EXTRACT]], 0, 0, 0 -; MAXPAYLOADSIZE-NEXT: [[DOTFCA_0_1_INSERT130:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_0_0_0_INSERT127]], float [[DOTFCA_0_1_EXTRACT]], 0, 1 -; MAXPAYLOADSIZE-NEXT: [[DOTFCA_1_INSERT133:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_0_1_INSERT130]], i32 [[DOTFCA_1_EXTRACT]], 1 ; MAXPAYLOADSIZE-NEXT: [[DOTFCA_0_INSERT3:%.*]] = insertvalue [30 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; MAXPAYLOADSIZE-NEXT: [[DOTFCA_1_INSERT6:%.*]] = insertvalue [30 x i32] [[DOTFCA_0_INSERT3]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 ; MAXPAYLOADSIZE-NEXT: [[DOTFCA_2_INSERT9:%.*]] = insertvalue [30 x i32] [[DOTFCA_1_INSERT6]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 @@ -202,9 +195,6 @@ attributes #2 = { nounwind } ; PRESERVEDPAYLOADSIZE-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] poison, i32 [[DOTFCA_0_0_0_EXTRACT32]], 0, 0 ; PRESERVEDPAYLOADSIZE-NEXT: [[DOTFCA_1_INSERT36:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] [[DOTFCA_0_0_INSERT]], float [[DOTFCA_0_1_EXTRACT33]], 1 ; PRESERVEDPAYLOADSIZE-NEXT: [[TMP3:%.*]] = call i64 @continuation.getAddrAndMD(ptr @_cont_Traversal) -; PRESERVEDPAYLOADSIZE-NEXT: 
[[DOTFCA_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, i32 [[DOTFCA_0_0_0_EXTRACT]], 0, 0, 0 -; PRESERVEDPAYLOADSIZE-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_0_0_0_INSERT]], float [[DOTFCA_0_1_EXTRACT]], 0, 1 -; PRESERVEDPAYLOADSIZE-NEXT: [[DOTFCA_1_INSERT20:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_0_1_INSERT]], i32 [[DOTFCA_1_EXTRACT]], 1 ; PRESERVEDPAYLOADSIZE-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; PRESERVEDPAYLOADSIZE-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 ; PRESERVEDPAYLOADSIZE-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 @@ -215,9 +205,6 @@ attributes #2 = { nounwind } ; PRESERVEDPAYLOADSIZE: 5: ; PRESERVEDPAYLOADSIZE-NEXT: [[DOTFCA_0_0_INSERT39:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] poison, i32 [[DOTFCA_0_0_0_EXTRACT32]], 0, 0 ; PRESERVEDPAYLOADSIZE-NEXT: [[DOTFCA_1_INSERT42:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] [[DOTFCA_0_0_INSERT39]], float [[DOTFCA_0_1_EXTRACT33]], 1 -; PRESERVEDPAYLOADSIZE-NEXT: [[DOTFCA_0_0_0_INSERT23:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, i32 [[DOTFCA_0_0_0_EXTRACT]], 0, 0, 0 -; PRESERVEDPAYLOADSIZE-NEXT: [[DOTFCA_0_1_INSERT26:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_0_0_0_INSERT23]], float [[DOTFCA_0_1_EXTRACT]], 0, 1 -; PRESERVEDPAYLOADSIZE-NEXT: [[DOTFCA_1_INSERT29:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_0_1_INSERT26]], i32 [[DOTFCA_1_EXTRACT]], 1 ; PRESERVEDPAYLOADSIZE-NEXT: [[DOTFCA_0_INSERT3:%.*]] = insertvalue [4 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; PRESERVEDPAYLOADSIZE-NEXT: [[DOTFCA_1_INSERT6:%.*]] = insertvalue [4 x i32] [[DOTFCA_0_INSERT3]], i32 [[PAYLOAD_FCA_1_EXTRACT]], 1 ; PRESERVEDPAYLOADSIZE-NEXT: [[DOTFCA_2_INSERT9:%.*]] = insertvalue [4 x i32] [[DOTFCA_1_INSERT6]], i32 [[PAYLOAD_FCA_2_EXTRACT]], 2 diff 
--git a/llvmraytracing/test/dx/unnamed-type-intrinsics.ll b/llvmraytracing/test/dx/unnamed-type-intrinsics.ll index 2af4c8002b..65e1950b5e 100644 --- a/llvmraytracing/test/dx/unnamed-type-intrinsics.ll +++ b/llvmraytracing/test/dx/unnamed-type-intrinsics.ll @@ -358,8 +358,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP5]]) #[[ATTR1:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA22:![0-9]+]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[TMP2]] undef, [[TMP0]] [[DIS_DATA_I]], 0 @@ -412,15 +412,16 @@ attributes #5 = { 
nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP50]], i8 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() ; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP51]], i8 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP40]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = call [[DX_TYPES_HANDLE]] @[[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP40]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = extractelement <4 x float> [[TMP49]], i64 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = extractelement <4 x float> [[TMP49]], i64 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP56:%.*]] = extractelement <4 x float> [[TMP49]], i64 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP57:%.*]] = extractelement <4 x float> [[TMP49]], i64 3 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP53]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP54]], float [[TMP55]], float [[TMP56]], float [[TMP57]], i8 15) ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP5]]) #[[ATTR1]] -; LOWERRAYTRACINGPIPELINE-NEXT: ret void +; LOWERRAYTRACINGPIPELINE-NEXT: call void @lgc.cps.complete() +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %0 @MyClosestHit( @@ -449,7 +450,7 @@ attributes #5 = { nocallback 
nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP20]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP16]], ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] @[[_CONT_GETTRIANGLEHITATTRIBUTES:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP17]], ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP18]], ptr [[HITATTRS]], align 4 @@ -488,6 +489,6 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[TMP2]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = load [[TMP0]], ptr [[TMP46]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = load [10 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[TMP0]] [[TMP47]], [33 x i32] poison, [10 x i32] [[TMP49]]), !continuation.registercount [[META18]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[TMP0]] [[TMP47]], [33 x i32] poison, [10 x i32] [[TMP49]]), !continuation.registercount [[META18]] ; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; diff --git a/llvmraytracing/test/intrinsics/get-func-addr.ll b/llvmraytracing/test/intrinsics/get-func-addr.ll index 2a7fc4294e..bc6f45ca31 100644 --- a/llvmraytracing/test/intrinsics/get-func-addr.ll +++ b/llvmraytracing/test/intrinsics/get-func-addr.ll @@ -16,7 +16,7 @@ define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwin define { i64, i32 } @main() !lgc.rt.shaderstage !10 { ; CHECK-LABEL: define void @main -; CHECK-SAME: (i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META11:![0-9]+]] !continuation.registercount [[META5]] !continuation [[META12:![0-9]+]] { +; CHECK-SAME: (i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META10:![0-9]+]] !continuation.registercount [[META5]] !continuation [[META11:![0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; CHECK-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [0 x i32], align 4 @@ -24,7 +24,8 @@ define { i64, i32 } @main() !lgc.rt.shaderstage !10 { ; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; CHECK-NEXT: [[TMP1:%.*]] = call i64 (...) 
@lgc.cps.as.continuation.reference__i64(ptr @MyFunc) ; CHECK-NEXT: [[V0:%.*]] = insertvalue { i64, i32 } undef, i64 [[TMP1]], 0 -; CHECK-NEXT: ret void +; CHECK-NEXT: call void @lgc.cps.complete() +; CHECK-NEXT: unreachable ; entry: %val = call i64 @_AmdGetFuncAddrMyFunc() diff --git a/llvmraytracing/test/intrinsics/shader-start.ll b/llvmraytracing/test/intrinsics/shader-start.ll index d2f50b0b25..e46ed8bd77 100644 --- a/llvmraytracing/test/intrinsics/shader-start.ll +++ b/llvmraytracing/test/intrinsics/shader-start.ll @@ -21,7 +21,7 @@ define void @main() !lgc.rt.shaderstage !10 { ; CHECK-NEXT: store i32 123, ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 -; CHECK-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], [8 x i32] poison, [30 x i32] [[TMP2]]), !continuation.registercount [[META0]] +; CHECK-NEXT: call void (...) @lgc.cps.jump(i64 [[RETURNADDR]], i32 -1, {} poison, i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], [8 x i32] poison, [30 x i32] [[TMP2]]), !continuation.registercount [[META0]] ; CHECK-NEXT: unreachable ; entry: diff --git a/llvmraytracing/test/lgccps/entry-point-with-cps.ll b/llvmraytracing/test/lgccps/entry-point-with-cps.ll index bbdfce955c..02f18af2e8 100644 --- a/llvmraytracing/test/lgccps/entry-point-with-cps.ll +++ b/llvmraytracing/test/lgccps/entry-point-with-cps.ll @@ -6,6 +6,8 @@ ; Details of the output are likely to differ from the final production pass, ; especially instruction order and value names. 
+declare void @lgc.cps.complete() + define spir_func void @raygen({} %state, i32 %rcr) !lgc.shaderstage !{i32 7} !lgc.cps !{i32 0} { %pushconst = call ptr addrspace(4) @lgc.user.data(i32 0) %fn = load ptr, ptr addrspace(4) %pushconst @@ -21,7 +23,8 @@ define spir_func void @raygen({} %state, i32 %rcr) !lgc.shaderstage !{i32 7} !lg store [2 x i32] %r, ptr addrspace(1) %dst ; Note: RGS returns, meaning end of thread. - ret void + call void @lgc.cps.complete() + unreachable } define spir_func void @chs({} %state, i32 %rcr, i32 %x) !lgc.shaderstage !{i32 7} !lgc.cps !{i32 1} { @@ -54,7 +57,8 @@ main: exit: ; Note: Entry kernel also returns - ret void + call void @lgc.cps.complete() + unreachable } declare ptr addrspace(4) @lgc.user.data(i32) @@ -152,7 +156,7 @@ declare void @lgc.cps.jump(...) ; LOWER-AWAIT-NEXT: [[TMP6:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP5]]) ; LOWER-AWAIT-NEXT: [[TMP7:%.*]] = call [2 x i32] @lgc.ilcps.getReturnValue__a2i32() ; LOWER-AWAIT-NEXT: store [2 x i32] [[TMP7]], ptr addrspace(1) [[DST]], align 4 -; LOWER-AWAIT-NEXT: call void (...) @lgc.ilcps.return(i32 poison) +; LOWER-AWAIT-NEXT: call void @lgc.cps.complete() ; LOWER-AWAIT-NEXT: unreachable ; ; @@ -190,6 +194,6 @@ declare void @lgc.cps.jump(...) ; LOWER-AWAIT-NEXT: [[TMP5:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP4]]) ; LOWER-AWAIT-NEXT: br label [[EXIT]] ; LOWER-AWAIT: exit: -; LOWER-AWAIT-NEXT: call void (...) 
@lgc.ilcps.return(i32 poison) +; LOWER-AWAIT-NEXT: call void @lgc.cps.complete() ; LOWER-AWAIT-NEXT: unreachable ; diff --git a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-get-i32.ll b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-get-i32.ll new file mode 100644 index 0000000000..0ce825d1c1 --- /dev/null +++ b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-get-i32.ll @@ -0,0 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt --verify-each -passes='lower-raytracing-pipeline,lint' -S %s --lint-abort-on-error | FileCheck %s + +%struct.AnyHitTraversalData = type { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } +%struct.DispatchSystemData = type { i32 } + +; Need _cont_ReportHit to get system data type +declare !pointeetys !6 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) + +declare !pointeetys !10 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) + +declare i64 @_AmdGetCurrentFuncAddr() +declare i32 @_AmdContPayloadRegistersGetI32(i32) + +@debug_global = external global i32 + +define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @_cont_Traversal(ptr addrspace(5) %0) local_unnamed_addr !lgc.shaderstage !0 !pointeetys !1 !lgc.rt.shaderstage !3 { +; CHECK-LABEL: define dso_local spir_func void @_cont_Traversal( +; CHECK-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], { { i32 } } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [41 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage [[META4:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !lgc.cps [[META6:![0-9]+]] !continuation [[META7:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca { { i32 } }, align 8, 
addrspace(5) +; CHECK-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [30 x i32], align 4 +; CHECK-NEXT: store [30 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-NEXT: store { { i32 } } [[SYSTEM_DATA]], ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 2 +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[VAL]], ptr @debug_global, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i32 poison, i32 5, [42 x i32] poison, [30 x i32] [[TMP2]]), !continuation.registercount [[META0:![0-9]+]] +; CHECK-NEXT: unreachable +; +.entry: + %val = call i32 @_AmdContPayloadRegistersGetI32(i32 2) + store i32 %val, i32* @debug_global, align 4 + call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i32 poison, i32 5) + unreachable +} + +declare void @lgc.cps.jump(...) 
local_unnamed_addr + +!lgc.cps.module = !{} + +!0 = !{i32 7} +!1 = !{ { { i32 } } poison} +!3 = !{i32 6} +!5 = !{i32 0, %struct.AnyHitTraversalData poison} +!6 = !{ %struct.AnyHitTraversalData poison} +!7 = !{i32 8} +!9 = !{i32 0, %struct.DispatchSystemData poison} +!10 = !{%struct.DispatchSystemData poison} diff --git a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll new file mode 100644 index 0000000000..c0b0673fdb --- /dev/null +++ b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-i32-count.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt --verify-each -passes='lower-raytracing-pipeline,lint' -S %s --lint-abort-on-error | FileCheck %s + +%struct.AnyHitTraversalData = type { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } +%struct.DispatchSystemData = type { i32 } + +; Need _cont_ReportHit to get system data type +declare !pointeetys !6 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) + +declare !pointeetys !10 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) + +declare i64 @_AmdGetCurrentFuncAddr() +declare i32 @_AmdContPayloadRegistersI32Count() + +@debug_global = external global i32 + +define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @_cont_Traversal(ptr addrspace(5) %0) local_unnamed_addr !lgc.shaderstage !0 !pointeetys !1 !lgc.rt.shaderstage !3 { +; CHECK-LABEL: define dso_local spir_func void @_cont_Traversal( +; CHECK-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], { { i32 } } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [41 x i32] [[PADDING:%.*]], [11 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage 
[[META4:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !lgc.cps [[META6:![0-9]+]] !continuation [[META7:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca { { i32 } }, align 8, addrspace(5) +; CHECK-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [11 x i32], align 4 +; CHECK-NEXT: store [11 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-NEXT: store { { i32 } } [[SYSTEM_DATA]], ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 4 +; CHECK-NEXT: store i32 11, ptr @debug_global, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load [11 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i32 poison, i32 5, [42 x i32] poison, [11 x i32] [[TMP0]]), !continuation.registercount [[META1:![0-9]+]] +; CHECK-NEXT: unreachable +; +.entry: + %val = call i32 @_AmdContPayloadRegistersI32Count() + store i32 %val, i32* @debug_global, align 4 + call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i32 poison, i32 5) + unreachable +} + +declare void @lgc.cps.jump(...) 
local_unnamed_addr + +!lgc.cps.module = !{} +!continuation.maxPayloadRegisterCount = !{!11} +!continuation.preservedPayloadRegisterCount = !{!12} + +!0 = !{i32 7} +!1 = !{ { { i32 } } poison} +!3 = !{i32 6} +!5 = !{i32 0, %struct.AnyHitTraversalData poison} +!6 = !{ %struct.AnyHitTraversalData poison} +!7 = !{i32 8} +!9 = !{i32 0, %struct.DispatchSystemData poison} +!10 = !{%struct.DispatchSystemData poison} +!11 = !{i32 15} +!12 = !{i32 11} diff --git a/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-set-i32.ll b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-set-i32.ll new file mode 100644 index 0000000000..8e8fbf3034 --- /dev/null +++ b/llvmraytracing/test/lgccps/intrinsics/cont-payload-registers-set-i32.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt --verify-each -passes='lower-raytracing-pipeline,lint' -S %s --lint-abort-on-error | FileCheck %s + +%struct.AnyHitTraversalData = type { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } +%struct.DispatchSystemData = type { i32 } + +; Need _cont_ReportHit to get system data type +declare !pointeetys !6 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) + +declare !pointeetys !10 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) + +declare i64 @_AmdGetCurrentFuncAddr() +declare void @_AmdContPayloadRegistersSetI32(i32, i32) + +define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @_cont_Traversal(ptr addrspace(5) %0) local_unnamed_addr !lgc.shaderstage !0 !pointeetys !1 !lgc.rt.shaderstage !3 { +; CHECK-LABEL: define dso_local spir_func void @_cont_Traversal( +; CHECK-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], { { i32 } } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [41 x 
i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage [[META4:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !lgc.cps [[META6:![0-9]+]] !continuation [[META7:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca { { i32 } }, align 8, addrspace(5) +; CHECK-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [30 x i32], align 4 +; CHECK-NEXT: store [30 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-NEXT: store { { i32 } } [[SYSTEM_DATA]], ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], i32 0, i32 3 +; CHECK-NEXT: store i32 42, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load [30 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i32 poison, i32 5, [42 x i32] poison, [30 x i32] [[TMP1]]), !continuation.registercount [[META0:![0-9]+]] +; CHECK-NEXT: unreachable +; +.entry: + call void @_AmdContPayloadRegistersSetI32(i32 3, i32 42) + call void (...) @lgc.cps.jump(i32 4, i32 -1, {} poison, i32 poison, i32 5) + unreachable +} + +declare void @lgc.cps.jump(...) 
local_unnamed_addr + +!lgc.cps.module = !{} + +!0 = !{i32 7} +!1 = !{ { { i32 } } poison} +!3 = !{i32 6} +!5 = !{i32 0, %struct.AnyHitTraversalData poison} +!6 = !{ %struct.AnyHitTraversalData poison} +!7 = !{i32 8} +!9 = !{i32 0, %struct.DispatchSystemData poison} +!10 = !{%struct.DispatchSystemData poison} diff --git a/llvmraytracing/test/lgccps/lower-traversal.ll b/llvmraytracing/test/lgccps/lower-traversal.ll index 0614d85f58..0d6602dadf 100644 --- a/llvmraytracing/test/lgccps/lower-traversal.ll +++ b/llvmraytracing/test/lgccps/lower-traversal.ll @@ -18,6 +18,8 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @ ; CHECK-ATTRSIZE-16-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [7 x i32] [[PADDING:%.*]], [8 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage [[META5:![0-9]+]] !lgc.rt.shaderstage [[META6:![0-9]+]] !lgc.cps [[META7:![0-9]+]] !continuation [[META8:![0-9]+]] { ; CHECK-ATTRSIZE-16-NEXT: .entry: ; CHECK-ATTRSIZE-16-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, align 16, addrspace(5) +; CHECK-ATTRSIZE-16-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [8 x i32], align 4 +; CHECK-ATTRSIZE-16-NEXT: store [8 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; CHECK-ATTRSIZE-16-NEXT: store { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[SYSTEM_DATA]], ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 16 ; CHECK-ATTRSIZE-16-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) 
@lgc.load.buffer.desc(i64 93, i32 17, i32 0, i32 0) ; CHECK-ATTRSIZE-16-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) @@ -120,9 +122,10 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @ ; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_6_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_5_INSERT]], i32 [[TMP40]], 2, 6 ; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_7_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_6_INSERT]], i32 [[TMP42]], 2, 7 ; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_8_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_7_INSERT]], i64 [[TMP44]], 2, 8 -; CHECK-ATTRSIZE-16-NEXT: call void (...) @lgc.cps.jump(i32 [[DOTSROA_0128_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 [[DOTSROA_0130_0_EXTRACT_TRUNC]], i32 [[DOT0]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT]], [7 x i32] poison, [8 x i32] [[PAYLOAD]]) +; CHECK-ATTRSIZE-16-NEXT: [[TMP109:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-ATTRSIZE-16-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DOTSROA_0128_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 [[DOTSROA_0130_0_EXTRACT_TRUNC]], i32 [[DOT0]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT]], [7 x i32] poison, [8 x i32] [[TMP109]]), !continuation.registercount [[META0:![0-9]+]] ; CHECK-ATTRSIZE-16-NEXT: unreachable -; CHECK-ATTRSIZE-16: 67: +; CHECK-ATTRSIZE-16: 68: ; CHECK-ATTRSIZE-16-NEXT: [[TMP68:%.*]] = shl i32 [[DOTFR]], 3 ; CHECK-ATTRSIZE-16-NEXT: [[TMP69:%.*]] = and i32 [[TMP68]], -64 ; CHECK-ATTRSIZE-16-NEXT: [[TMP70:%.*]] = zext i32 [[TMP69]] to i64 @@ -151,7 +154,7 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @ ; CHECK-ATTRSIZE-16-NEXT: [[DOTFR537:%.*]] = freeze i64 [[TMP91]] ; CHECK-ATTRSIZE-16-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DOTFR537]], 0 ; CHECK-ATTRSIZE-16-NEXT: br i1 [[DOTNOT]], label [[DOTEXIT5:%.*]], label [[TMP92:%.*]] -; CHECK-ATTRSIZE-16: 92: +; CHECK-ATTRSIZE-16: 93: ; CHECK-ATTRSIZE-16-NEXT: [[TMP93:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 11 ; CHECK-ATTRSIZE-16-NEXT: [[TMP94:%.*]] = load i32, ptr addrspace(7) [[TMP93]], align 4 ; CHECK-ATTRSIZE-16-NEXT: [[TMP95:%.*]] = mul i32 [[TMP94]], [[TMP83]] @@ -170,7 +173,7 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @ ; CHECK-ATTRSIZE-16-NEXT: [[DOTNOT540:%.*]] = icmp eq i32 [[DOTSROA_0150_0_VEC_EXTRACT]], 0 ; CHECK-ATTRSIZE-16-NEXT: [[OR_COND:%.*]] = or i1 [[TMP103]], [[DOTNOT540]] ; CHECK-ATTRSIZE-16-NEXT: br i1 [[OR_COND]], label [[TMP107]], label [[TMP104:%.*]] -; CHECK-ATTRSIZE-16: 104: +; CHECK-ATTRSIZE-16: 105: ; CHECK-ATTRSIZE-16-NEXT: [[TMP105:%.*]] = call i32 (...) 
@lgc.cps.as.continuation.reference__i32(ptr @_cont_Traversal) ; CHECK-ATTRSIZE-16-NEXT: [[TMP106:%.*]] = zext i32 [[TMP105]] to i64 ; CHECK-ATTRSIZE-16-NEXT: [[DOTSROA_0320_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[TMP106]] to i32 @@ -196,20 +199,24 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @ ; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_6_INSERT341:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_5_INSERT340]], i32 [[TMP40]], 2, 6 ; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_7_INSERT342:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_6_INSERT341]], i32 [[TMP42]], 2, 7 ; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_8_INSERT343:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_7_INSERT342]], i64 [[TMP44]], 2, 8 -; CHECK-ATTRSIZE-16-NEXT: call void (...) @lgc.cps.jump(i32 [[DOTSROA_0150_0_VEC_EXTRACT]], i32 -1, {} poison, i32 [[DOTSROA_0320_0_EXTRACT_TRUNC]], i32 [[TMP83]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT343]], [7 x i32] poison, [8 x i32] [[PAYLOAD]]) +; CHECK-ATTRSIZE-16-NEXT: [[TMP108:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-ATTRSIZE-16-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DOTSROA_0150_0_VEC_EXTRACT]], i32 -1, {} poison, i32 [[DOTSROA_0320_0_EXTRACT_TRUNC]], i32 [[TMP83]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT343]], [7 x i32] poison, [8 x i32] [[TMP108]]), !continuation.registercount [[META0]] ; CHECK-ATTRSIZE-16-NEXT: unreachable -; CHECK-ATTRSIZE-16: 107: +; CHECK-ATTRSIZE-16: 109: ; CHECK-ATTRSIZE-16-NEXT: [[DOTSROA_7_0:%.*]] = phi i32 [ [[TMP4]], [[DOTEXIT2]] ], [ [[TMP83]], [[DOTEXIT5]] ] ; CHECK-ATTRSIZE-16-NEXT: [[DOTSROA_0373_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[TMP44]] to i32 ; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { <3 x i32>, i32 } poison, <3 x i32> [[TMP2]], 0 ; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { <3 x i32>, i32 } [[DOTFCA_0_INSERT]], i32 [[DOTSROA_7_0]], 1 -; CHECK-ATTRSIZE-16-NEXT: call void (...) @lgc.cps.jump(i32 [[DOTSROA_0373_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 poison, i32 [[DOTSROA_7_0]], { <3 x i32>, i32 } [[DOTFCA_1_INSERT]], [34 x i32] poison, [8 x i32] [[PAYLOAD]]) +; CHECK-ATTRSIZE-16-NEXT: [[TMP110:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-ATTRSIZE-16-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DOTSROA_0373_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 poison, i32 [[DOTSROA_7_0]], { <3 x i32>, i32 } [[DOTFCA_1_INSERT]], [34 x i32] poison, [8 x i32] [[TMP110]]), !continuation.registercount [[META0]] ; CHECK-ATTRSIZE-16-NEXT: unreachable ; ; CHECK-ATTRSIZE-8-LABEL: define dso_local spir_func void @_cont_Traversal( ; CHECK-ATTRSIZE-8-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [5 x i32] [[PADDING:%.*]], [8 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage [[META4:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !lgc.cps [[META6:![0-9]+]] !continuation [[META7:![0-9]+]] { ; CHECK-ATTRSIZE-8-NEXT: .entry: ; CHECK-ATTRSIZE-8-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, align 16, addrspace(5) +; CHECK-ATTRSIZE-8-NEXT: [[PAYLOAD_SERIALIZATION_ALLOCA:%.*]] = alloca [8 x i32], align 4 +; CHECK-ATTRSIZE-8-NEXT: store [8 x i32] [[PAYLOAD]], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 ; CHECK-ATTRSIZE-8-NEXT: store { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[SYSTEM_DATA]], ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 16 ; CHECK-ATTRSIZE-8-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 93, i32 17, i32 0, i32 0) ; CHECK-ATTRSIZE-8-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) @@ -312,9 +319,10 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @ ; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_6_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 
}, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_5_INSERT]], i32 [[TMP40]], 2, 6 ; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_7_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_6_INSERT]], i32 [[TMP42]], 2, 7 ; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_8_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_7_INSERT]], i64 [[TMP44]], 2, 8 -; CHECK-ATTRSIZE-8-NEXT: call void (...) @lgc.cps.jump(i32 [[DOTSROA_0128_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 [[DOTSROA_0130_0_EXTRACT_TRUNC]], i32 [[DOT0]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT]], [5 x i32] poison, [8 x i32] [[PAYLOAD]]) +; CHECK-ATTRSIZE-8-NEXT: [[TMP109:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-ATTRSIZE-8-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DOTSROA_0128_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 [[DOTSROA_0130_0_EXTRACT_TRUNC]], i32 [[DOT0]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT]], [5 x i32] poison, [8 x i32] [[TMP109]]), !continuation.registercount [[META0:![0-9]+]] ; CHECK-ATTRSIZE-8-NEXT: unreachable -; CHECK-ATTRSIZE-8: 67: +; CHECK-ATTRSIZE-8: 68: ; CHECK-ATTRSIZE-8-NEXT: [[TMP68:%.*]] = shl i32 [[DOTFR]], 3 ; CHECK-ATTRSIZE-8-NEXT: [[TMP69:%.*]] = and i32 [[TMP68]], -64 ; CHECK-ATTRSIZE-8-NEXT: [[TMP70:%.*]] = zext i32 [[TMP69]] to i64 @@ -343,7 +351,7 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @ ; CHECK-ATTRSIZE-8-NEXT: [[DOTFR537:%.*]] = freeze i64 [[TMP91]] ; CHECK-ATTRSIZE-8-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DOTFR537]], 0 ; CHECK-ATTRSIZE-8-NEXT: br i1 [[DOTNOT]], label [[DOTEXIT5:%.*]], label [[TMP92:%.*]] -; CHECK-ATTRSIZE-8: 92: +; CHECK-ATTRSIZE-8: 93: ; CHECK-ATTRSIZE-8-NEXT: [[TMP93:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 11 ; CHECK-ATTRSIZE-8-NEXT: [[TMP94:%.*]] = load i32, ptr addrspace(7) [[TMP93]], align 4 ; CHECK-ATTRSIZE-8-NEXT: [[TMP95:%.*]] = mul i32 [[TMP94]], [[TMP83]] @@ -362,7 +370,7 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @ ; CHECK-ATTRSIZE-8-NEXT: [[DOTNOT540:%.*]] = icmp eq i32 [[DOTSROA_0150_0_VEC_EXTRACT]], 0 ; CHECK-ATTRSIZE-8-NEXT: [[OR_COND:%.*]] = or i1 [[TMP103]], [[DOTNOT540]] ; CHECK-ATTRSIZE-8-NEXT: br i1 [[OR_COND]], label [[TMP107]], label [[TMP104:%.*]] -; CHECK-ATTRSIZE-8: 104: +; CHECK-ATTRSIZE-8: 105: ; CHECK-ATTRSIZE-8-NEXT: [[TMP105:%.*]] = call i32 (...) 
@lgc.cps.as.continuation.reference__i32(ptr @_cont_Traversal) ; CHECK-ATTRSIZE-8-NEXT: [[TMP106:%.*]] = zext i32 [[TMP105]] to i64 ; CHECK-ATTRSIZE-8-NEXT: [[DOTSROA_0320_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[TMP106]] to i32 @@ -388,14 +396,16 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @ ; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_6_INSERT341:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_5_INSERT340]], i32 [[TMP40]], 2, 6 ; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_7_INSERT342:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_6_INSERT341]], i32 [[TMP42]], 2, 7 ; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_8_INSERT343:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_7_INSERT342]], i64 [[TMP44]], 2, 8 -; CHECK-ATTRSIZE-8-NEXT: call void (...) @lgc.cps.jump(i32 [[DOTSROA_0150_0_VEC_EXTRACT]], i32 -1, {} poison, i32 [[DOTSROA_0320_0_EXTRACT_TRUNC]], i32 [[TMP83]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT343]], [5 x i32] poison, [8 x i32] [[PAYLOAD]]) +; CHECK-ATTRSIZE-8-NEXT: [[TMP108:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-ATTRSIZE-8-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DOTSROA_0150_0_VEC_EXTRACT]], i32 -1, {} poison, i32 [[DOTSROA_0320_0_EXTRACT_TRUNC]], i32 [[TMP83]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT343]], [5 x i32] poison, [8 x i32] [[TMP108]]), !continuation.registercount [[META0]] ; CHECK-ATTRSIZE-8-NEXT: unreachable -; CHECK-ATTRSIZE-8: 107: +; CHECK-ATTRSIZE-8: 109: ; CHECK-ATTRSIZE-8-NEXT: [[DOTSROA_7_0:%.*]] = phi i32 [ [[TMP4]], [[DOTEXIT2]] ], [ [[TMP83]], [[DOTEXIT5]] ] ; CHECK-ATTRSIZE-8-NEXT: [[DOTSROA_0373_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[TMP44]] to i32 ; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { <3 x i32>, i32 } poison, <3 x i32> [[TMP2]], 0 ; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { <3 x i32>, i32 } [[DOTFCA_0_INSERT]], i32 [[DOTSROA_7_0]], 1 -; CHECK-ATTRSIZE-8-NEXT: call void (...) @lgc.cps.jump(i32 [[DOTSROA_0373_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 poison, i32 [[DOTSROA_7_0]], { <3 x i32>, i32 } [[DOTFCA_1_INSERT]], [32 x i32] poison, [8 x i32] [[PAYLOAD]]) +; CHECK-ATTRSIZE-8-NEXT: [[TMP110:%.*]] = load [8 x i32], ptr [[PAYLOAD_SERIALIZATION_ALLOCA]], align 4 +; CHECK-ATTRSIZE-8-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DOTSROA_0373_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 poison, i32 [[DOTSROA_7_0]], { <3 x i32>, i32 } [[DOTFCA_1_INSERT]], [32 x i32] poison, [8 x i32] [[TMP110]]), !continuation.registercount [[META0]] ; CHECK-ATTRSIZE-8-NEXT: unreachable ; .entry: diff --git a/tool/dumper/vkgcPipelineDumper.cpp b/tool/dumper/vkgcPipelineDumper.cpp index 921438cd41..2a1ca060d5 100644 --- a/tool/dumper/vkgcPipelineDumper.cpp +++ b/tool/dumper/vkgcPipelineDumper.cpp @@ -1018,6 +1018,7 @@ void PipelineDumper::dumpGraphicsStateInfo(const GraphicsPipelineBuildInfo *pipe dumpFile << "samplePatternIdx = " << pipelineInfo->rsState.samplePatternIdx << "\n"; dumpFile << "dynamicSampleInfo = " << pipelineInfo->rsState.dynamicSampleInfo << "\n"; dumpFile << "rasterStream = " << pipelineInfo->rsState.rasterStream << "\n"; + dumpFile << "enableMapClipDistMask = " << pipelineInfo->glState.enableMapClipDistMask << "\n"; dumpFile << "usrClipPlaneMask = " << static_cast(pipelineInfo->rsState.usrClipPlaneMask) << "\n"; dumpFile << "alphaToCoverageEnable = " << pipelineInfo->cbState.alphaToCoverageEnable << "\n"; dumpFile << "dualSourceBlendEnable = " << pipelineInfo->cbState.dualSourceBlendEnable << "\n"; @@ -1058,6 +1059,7 @@ void PipelineDumper::dumpGraphicsStateInfo(const GraphicsPipelineBuildInfo *pipe dumpFile << "enableColorClampVs = " << pipelineInfo->glState.enableColorClampVs << "\n"; dumpFile << "enableColorClampFs = " << pipelineInfo->glState.enableColorClampFs << "\n"; dumpFile << "enableFlatShade = " << pipelineInfo->glState.enableFlatShade << "\n"; + dumpFile << "alphaTestFunc = " << pipelineInfo->glState.alphaTestFunc << "\n"; dumpFile << "originUpperLeft = " << pipelineInfo->getGlState().originUpperLeft << "\n"; if (pipelineInfo->clientMetadataSize > 0) { @@ -1085,9 +1087,6 @@ void PipelineDumper::dumpGraphicsStateInfo(const GraphicsPipelineBuildInfo *pipe } dumpFile << "forceDisableStreamOut = " << pipelineInfo->getGlState().apiXfbOutData.forceDisableStreamOut 
<< "\n"; -#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 70 - dumpFile << "forceEnablePrimStats = " << pipelineInfo->apiXfbOutData.forceEnablePrimStats << "\n"; -#endif const auto pXfbOutInfos = pipelineInfo->getGlState().apiXfbOutData.pXfbOutInfos; for (unsigned idx = 0; idx < pipelineInfo->getGlState().apiXfbOutData.numXfbOutInfo; ++idx) { dumpFile << "xfbOutInfo[" << idx << "].isBuiltIn = " << pXfbOutInfos[idx].isBuiltIn << "\n"; @@ -1100,6 +1099,7 @@ void PipelineDumper::dumpGraphicsStateInfo(const GraphicsPipelineBuildInfo *pipe } dumpFile << "vbAddressLowBitsKnown = " << pipelineInfo->getGlState().vbAddressLowBitsKnown << "\n"; dumpFile << "advancedBlendInfo.enableAdvancedBlend = " << pipelineInfo->advancedBlendInfo.enableAdvancedBlend << "\n"; + dumpFile << "advancedBlendInfo.enableRov = " << pipelineInfo->advancedBlendInfo.enableRov << "\n"; dumpFile << "advancedBlendInfo.binding = " << pipelineInfo->advancedBlendInfo.binding << "\n"; dumpPipelineOptions(&pipelineInfo->options, dumpFile); @@ -1581,11 +1581,14 @@ MetroHash::Hash PipelineDumper::generateHashForGraphicsPipeline(const GraphicsPi } hasher.Update(pipeline->advancedBlendInfo.enableAdvancedBlend); + hasher.Update(pipeline->advancedBlendInfo.enableRov); hasher.Update(pipeline->advancedBlendInfo.binding); hasher.Update(pipeline->glState.enableColorClampVs); hasher.Update(pipeline->glState.enableColorClampFs); hasher.Update(pipeline->glState.enableFlatShade); + hasher.Update(pipeline->glState.enableMapClipDistMask); + hasher.Update(pipeline->glState.alphaTestFunc); MetroHash::Hash hash = {}; hasher.Finalize(hash.bytes); @@ -1809,9 +1812,6 @@ void PipelineDumper::updateHashForNonFragmentState(const GraphicsPipelineBuildIn hasher->Update(*pipeline->iaState.tessLevel); hasher->Update(pipeline->getGlState().apiXfbOutData.forceDisableStreamOut); -#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 70 - hasher->Update(pipeline->apiXfbOutData.forceEnablePrimStats); -#endif for (unsigned i = 0; i < 
pipeline->getGlState().apiXfbOutData.numXfbOutInfo; i++) { hasher->Update(pipeline->getGlState().apiXfbOutData.pXfbOutInfos[i].isBuiltIn); hasher->Update(pipeline->getGlState().apiXfbOutData.pXfbOutInfos[i].location); @@ -1932,10 +1932,9 @@ void PipelineDumper::updateHashForPipelineShaderInfo(ShaderStage stage, const Pi hasher->Update(shaderInfo->options.clientHash); } else { const ShaderModuleData *moduleData = reinterpret_cast(shaderInfo->pModuleData); - hasher->Update(stage); if (isCacheHash) { hasher->Update(static_cast(voidPtrInc(moduleData, ShaderModuleCacheHashOffset)), - sizeof(moduleData->hash)); + sizeof(moduleData->cacheHash)); } else hasher->Update(moduleData->hash); } diff --git a/tool/vfx/vfxVkSection.h b/tool/vfx/vfxVkSection.h index 6c65f74e5c..acdf016c59 100644 --- a/tool/vfx/vfxVkSection.h +++ b/tool/vfx/vfxVkSection.h @@ -871,6 +871,7 @@ class SectionAdvancedBlendInfo : public Section { static std::vector addrTable = []() { std::vector addrTableInitializer; INIT_STATE_MEMBER_NAME_TO_ADDR(SectionAdvancedBlendInfo, enableAdvancedBlend, MemberTypeBool, false); + INIT_STATE_MEMBER_NAME_TO_ADDR(SectionAdvancedBlendInfo, enableRov, MemberTypeBool, false); INIT_STATE_MEMBER_NAME_TO_ADDR(SectionAdvancedBlendInfo, binding, MemberTypeInt, false); return addrTableInitializer; }(); @@ -939,6 +940,8 @@ class SectionGraphicsState : public Section { INIT_STATE_SUB_MEMBER_NAME_TO_ADDR(SectionGraphicsState, glState, enableColorClampVs, MemberTypeBool, false); INIT_STATE_SUB_MEMBER_NAME_TO_ADDR(SectionGraphicsState, glState, enableColorClampFs, MemberTypeBool, false); INIT_STATE_SUB_MEMBER_NAME_TO_ADDR(SectionGraphicsState, glState, enableFlatShade, MemberTypeBool, false); + INIT_STATE_SUB_MEMBER_NAME_TO_ADDR(SectionGraphicsState, glState, enableMapClipDistMask, MemberTypeBool, false); + INIT_STATE_SUB_MEMBER_NAME_TO_ADDR(SectionGraphicsState, glState, alphaTestFunc, MemberTypeInt, false); INIT_MEMBER_ARRAY_NAME_TO_ADDR(SectionGraphicsState, m_colorBuffer, 
MemberTypeColorBufferItem, Vkgc::MaxColorTargets, true); @@ -965,10 +968,6 @@ class SectionGraphicsState : public Section { INIT_STATE_SUB_MEMBER_NAME_TO_ADDR(SectionGraphicsState, glState, vbAddressLowBitsKnown, MemberTypeBool, false); #endif INIT_MEMBER_NAME_TO_ADDR(SectionGraphicsState, m_forceDisableStreamOut, MemberTypeBool, false); -#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 70 - INIT_STATE_SUB_MEMBER_NAME_TO_ADDR(SectionGraphicsState, apiXfbOutData, forceEnablePrimStats, MemberTypeBool, - false); -#endif INIT_MEMBER_DYNARRAY_NAME_TO_ADDR(SectionGraphicsState, m_xfbOutInfo, MemberTypeXfbOutInfo, true); INIT_MEMBER_NAME_TO_ADDR(SectionGraphicsState, m_advancedBlendInfo, MemberTypeAdvancedBlendInfo, true); return addrTableInitializer; diff --git a/util/gpurtshim/GpurtShim.cpp b/util/gpurtshim/GpurtShim.cpp index e08a889910..6e77b2c047 100644 --- a/util/gpurtshim/GpurtShim.cpp +++ b/util/gpurtshim/GpurtShim.cpp @@ -31,8 +31,8 @@ *********************************************************************************************************************** */ -#include "gpurt/gpurt.h" #include "vkgcGpurtShim.h" +#include "gpurt/gpurt.h" #include using namespace Vkgc; diff --git a/version/CMakeLists.txt b/version/CMakeLists.txt index 53a57b6881..23bb621cd3 100644 --- a/version/CMakeLists.txt +++ b/version/CMakeLists.txt @@ -118,3 +118,15 @@ if (LLPC_BUILD_PHOENIX2) target_compile_definitions(llpc_version INTERFACE LLPC_BUILD_PHOENIX2 CHIP_HDR_PHOENIX2) endif() #endif +#if VKI_BUILD_STRIX1 +option(LLPC_BUILD_STRIX1 "LLPC support for STRIX1?" ON) +if (LLPC_BUILD_STRIX1) + target_compile_definitions(llpc_version INTERFACE LLPC_BUILD_STRIX1 CHIP_HDR_STRIX1) +endif() +#endif +#if VKI_BUILD_GFX115 +option(LLPC_BUILD_GFX115 "LLPC support for GFX11.5?" 
ON) +if (LLPC_BUILD_GFX115) + target_compile_definitions(llpc_version INTERFACE LLPC_BUILD_GFX115 CHIP_HDR_GFX115) +endif() +#endif diff --git a/version/include/llpc/GpurtIntrinsics.h b/version/include/llpc/GpurtIntrinsics.h index 90072cc393..54dbb71660 100644 --- a/version/include/llpc/GpurtIntrinsics.h +++ b/version/include/llpc/GpurtIntrinsics.h @@ -246,3 +246,6 @@ GPURT_DECL bool _AmdContinuationStackIsGlobal() DUMMY_GENERIC_FUNC(0) // Intrinsic to get the current rtip version. // The version is encoded as in decimal digits, so 11 is rtip 1.1, 20 is rtip 2.0 GPURT_DECL RayTracingIpLevel _AmdGetRtip() DUMMY_GENERIC_FUNC(RayTracingIpLevel::_None) +//===================================================================================================================== +// Intrinsic that returns whether GPURT is compiled from a LLPC-based build or not. +GPURT_DECL bool _AmdIsLlpc() DUMMY_GENERIC_FUNC(0) diff --git a/version/include/llpcVersion.h.in b/version/include/llpcVersion.h.in index 8fe424cf37..48aec80a2d 100644 --- a/version/include/llpcVersion.h.in +++ b/version/include/llpcVersion.h.in @@ -37,6 +37,10 @@ // %Version History // | %Version | Change Description | // | -------- | ----------------------------------------------------------------------------------------------------- | +// | 75.1 | Add alphaFunc to GraphicPipelineBuildInfo. | +// | 75.0 | BuildRayTracingPipeline now will not generate kernel entry for pipeline library anymore. | +// | 74.2 | Add enableMapClipDistMask to GraphicsPipelineBuildInfo. | +// | 74.1 | Add AdvancedBlendInternalBinding to InternalBinding. Add enableRov to AdvancedBlendInfo. | // | 74.0 | Replace LlpcRaytracingMode::None with LlpcRaytracingMode::Auto. Now LLPC can choose continuations mode| // | | automatically. Add isCps to RayTracingPipelineBuildOut to notify client continuations mode is chosen. | // | 73.2 | Add imageSampleDrefReturnsRgba to ShaderOptions | @@ -186,7 +190,7 @@ #pragma once /// LLPC major interface version. 
-#define LLPC_INTERFACE_MAJOR_VERSION 74 +#define LLPC_INTERFACE_MAJOR_VERSION 75 /// LLPC minor interface version. #define LLPC_INTERFACE_MINOR_VERSION 0 From 689c795c881e26bfe63306b409790f73353e0904 Mon Sep 17 00:00:00 2001 From: qiaojbao Date: Wed, 28 Aug 2024 19:13:42 +0800 Subject: [PATCH 2/4] Update --- lgc/patch/LowerPopsInterlock.cpp | 3 +-- lgc/patch/LowerPopsInterlock.h | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lgc/patch/LowerPopsInterlock.cpp b/lgc/patch/LowerPopsInterlock.cpp index 75eb39c7b5..5636ba6dce 100644 --- a/lgc/patch/LowerPopsInterlock.cpp +++ b/lgc/patch/LowerPopsInterlock.cpp @@ -61,8 +61,7 @@ PreservedAnalyses LowerPopsInterlock::run(Function &func, FunctionAnalysisManage m_pipelineState = moduleAnalysisManager.getCachedResult(*func.getParent())->getPipelineState(); m_entryPoint = &func; - BuilderBase builder(m_pipelineState->getContext()); - m_builder = &builder; + m_builder = new BuilderBase(m_pipelineState->getContext()); legalizeInterlock(funcAnalysisManager); lowerInterlock(); diff --git a/lgc/patch/LowerPopsInterlock.h b/lgc/patch/LowerPopsInterlock.h index 0e26d4fb89..a6bea49183 100644 --- a/lgc/patch/LowerPopsInterlock.h +++ b/lgc/patch/LowerPopsInterlock.h @@ -44,6 +44,8 @@ class LowerPopsInterlock : public llvm::PassInfoMixin { static llvm::StringRef name() { return "Lower POPS interlock operations"; } + ~LowerPopsInterlock() { delete m_builder; } + private: void legalizeInterlock(llvm::FunctionAnalysisManager &funcAnalysisManager); void collectBeginInterlock(PopsBeginInterlockOp &popsBeginInterlockOp); From 3e71e207a81983071b8cef1adf76f8ba73e03aec Mon Sep 17 00:00:00 2001 From: qiaojbao Date: Thu, 29 Aug 2024 11:51:33 +0800 Subject: [PATCH 3/4] Update2 --- lgc/patch/LowerPopsInterlock.cpp | 5 +++++ lgc/patch/LowerPopsInterlock.h | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/lgc/patch/LowerPopsInterlock.cpp b/lgc/patch/LowerPopsInterlock.cpp index 5636ba6dce..1d79b5870a 100644 
--- a/lgc/patch/LowerPopsInterlock.cpp +++ b/lgc/patch/LowerPopsInterlock.cpp @@ -69,6 +69,11 @@ PreservedAnalyses LowerPopsInterlock::run(Function &func, FunctionAnalysisManage return m_changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } +// ===================================================================================================================== +LowerPopsInterlock::~LowerPopsInterlock() { + delete m_builder; +} + // ===================================================================================================================== // Legalize POPS interlock operations. // diff --git a/lgc/patch/LowerPopsInterlock.h b/lgc/patch/LowerPopsInterlock.h index a6bea49183..5fc3b9e848 100644 --- a/lgc/patch/LowerPopsInterlock.h +++ b/lgc/patch/LowerPopsInterlock.h @@ -40,12 +40,12 @@ class PipelineState; class LowerPopsInterlock : public llvm::PassInfoMixin { public: + ~LowerPopsInterlock(); + llvm::PreservedAnalyses run(llvm::Function &func, llvm::FunctionAnalysisManager &funcAnalysisManager); static llvm::StringRef name() { return "Lower POPS interlock operations"; } - ~LowerPopsInterlock() { delete m_builder; } - private: void legalizeInterlock(llvm::FunctionAnalysisManager &funcAnalysisManager); void collectBeginInterlock(PopsBeginInterlockOp &popsBeginInterlockOp); From 24129c2aa5ec05155c3cb86c76ec92a5950ebb1f Mon Sep 17 00:00:00 2001 From: qiaojbao Date: Thu, 29 Aug 2024 13:49:11 +0800 Subject: [PATCH 4/4] Update 3 --- lgc/patch/LowerPopsInterlock.cpp | 7 +------ lgc/patch/LowerPopsInterlock.h | 4 +--- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/lgc/patch/LowerPopsInterlock.cpp b/lgc/patch/LowerPopsInterlock.cpp index 1d79b5870a..c04db3dcc9 100644 --- a/lgc/patch/LowerPopsInterlock.cpp +++ b/lgc/patch/LowerPopsInterlock.cpp @@ -61,7 +61,7 @@ PreservedAnalyses LowerPopsInterlock::run(Function &func, FunctionAnalysisManage m_pipelineState = moduleAnalysisManager.getCachedResult(*func.getParent())->getPipelineState(); 
m_entryPoint = &func; - m_builder = new BuilderBase(m_pipelineState->getContext()); + m_builder.reset(new BuilderBase(m_pipelineState->getContext())); legalizeInterlock(funcAnalysisManager); lowerInterlock(); @@ -69,11 +69,6 @@ PreservedAnalyses LowerPopsInterlock::run(Function &func, FunctionAnalysisManage return m_changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } -// ===================================================================================================================== -LowerPopsInterlock::~LowerPopsInterlock() { - delete m_builder; -} - // ===================================================================================================================== // Legalize POPS interlock operations. // diff --git a/lgc/patch/LowerPopsInterlock.h b/lgc/patch/LowerPopsInterlock.h index 5fc3b9e848..8fde811aa9 100644 --- a/lgc/patch/LowerPopsInterlock.h +++ b/lgc/patch/LowerPopsInterlock.h @@ -40,8 +40,6 @@ class PipelineState; class LowerPopsInterlock : public llvm::PassInfoMixin { public: - ~LowerPopsInterlock(); - llvm::PreservedAnalyses run(llvm::Function &func, llvm::FunctionAnalysisManager &funcAnalysisManager); static llvm::StringRef name() { return "Lower POPS interlock operations"; } @@ -58,7 +56,7 @@ class LowerPopsInterlock : public llvm::PassInfoMixin { PipelineState *m_pipelineState = nullptr; // Pipeline state llvm::Function *m_entryPoint = nullptr; // Entry-point of fragment shader - BuilderBase *m_builder = nullptr; // LLVM IR builder + std::unique_ptr m_builder; // LLVM IR builder // List of POPS interlock operations llvm::SmallVector m_beginInterlocks;