Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[LLPC] Scalarize non-uniform loads inside the waterfall loop #2759

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion include/vkgcDefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ struct optional_bool : private std::optional<bool> {
using std::optional<bool>::has_value;
using std::optional<bool>::value;
using std::optional<bool>::value_or;
using std::optional<bool>::operator*;
};

/// Enumerates result codes of LLPC operations.
Expand Down Expand Up @@ -888,7 +889,7 @@ struct PipelineShaderOptions {
unsigned ldsSpillLimitDwords;

/// Attempt to scalarize waterfall descriptor loads.
bool scalarizeWaterfallLoads;
optional_bool scalarizeWaterfallLoads;

/// Force rearranges threadId within group into blocks of 8*8 or 8*4
bool overrideForceThreadIdSwizzling;
Expand Down
288 changes: 245 additions & 43 deletions lgc/builder/BuilderImpl.cpp

Large diffs are not rendered by default.

113 changes: 113 additions & 0 deletions lgc/test/scalarizationOfDescriptorLoadsTest1.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc
; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s
; ModuleID = 'lgcPipeline'
source_filename = "lgcPipeline"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32"
target triple = "amdgcn--amdpal"

; Function Attrs: nounwind
define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spirv.ExecutionModel !14 !lgc.shaderstage !15 {
.entry:
%0 = call <4 x i32> (...) @lgc.create.read.generic.input.v4i32(i32 2, i32 0, i32 0, i32 0, i32 0, i32 poison)
%.fr = freeze <4 x i32> %0
%__llpc_input_proxy_4.0.vec.extract = extractelement <4 x i32> %.fr, i64 0
%__llpc_input_proxy_4.4.vec.extract = extractelement <4 x i32> %.fr, i64 1
%1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 4, i32 4, i64 0, i32 0)
%2 = call i32 (...) @lgc.create.get.desc.stride.i32(i32 4, i32 4, i64 0, i32 0)
%3 = mul i32 %__llpc_input_proxy_4.4.vec.extract, %2
%4 = sext i32 %3 to i64
%5 = getelementptr i8, ptr addrspace(4) %1, i64 %4
%6 = load <4 x i32>, ptr addrspace(4) %5, align 16, !invariant.load !16
%7 = mul i32 %__llpc_input_proxy_4.0.vec.extract, %2
%8 = sext i32 %7 to i64
%9 = getelementptr i8, ptr addrspace(4) %1, i64 %8
%10 = load <4 x i32>, ptr addrspace(4) %9, align 16, !invariant.load !16
%11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 8, <4 x i32> %10, i32 0)
call void (...) @lgc.create.image.store(<4 x float> %11, i32 0, i32 8, <4 x i32> %6, i32 1)
ret void
}

; Function Attrs: nounwind willreturn memory(read)
declare <4 x i32> @lgc.create.read.generic.input.v4i32(...) local_unnamed_addr #1

; Function Attrs: nounwind memory(none)
declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #2

; Function Attrs: nounwind memory(none)
declare i32 @lgc.create.get.desc.stride.i32(...) local_unnamed_addr #2

; Function Attrs: nounwind willreturn memory(read)
declare <4 x float> @lgc.create.image.load.v4f32(...) local_unnamed_addr #1

; Function Attrs: nounwind memory(write)
declare void @lgc.create.image.store(...) local_unnamed_addr #3

attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" }
attributes #1 = { nounwind willreturn memory(read) }
attributes #2 = { nounwind memory(none) }
attributes #3 = { nounwind memory(write) }

!lgc.client = !{!0}
!lgc.options = !{!1}
!lgc.options.VS = !{!2}
!lgc.options.FS = !{!3}
!lgc.user.data.nodes = !{!4, !5, !6, !7}
!lgc.vertex.inputs = !{!8, !9, !10}
!lgc.color.export.formats = !{!11}
!lgc.rasterizer.state = !{!12}
!amdgpu.pal.metadata.msgpack = !{!13}

!0 = !{!"Vulkan"}
!1 = !{i32 -1055878566, i32 -1332805290, i32 1045905781, i32 -589165353, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216}
!2 = !{i32 1639417258, i32 -1495429105, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1}
!3 = !{i32 -1409621709, i32 -171549995, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1}
!4 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 0, i32 1, i32 1}
!5 = !{!"DescriptorTexelBuffer", i32 4, i32 66, i32 0, i32 16384, i64 0, i32 0, i32 4}
!6 = !{!"StreamOutTableVaPtr", i32 11, i32 4, i32 1, i32 1, i32 0}
!7 = !{!"IndirectUserDataVaPtr", i32 8, i32 2, i32 2, i32 1, i32 4}
!8 = !{i32 0, i32 0, i32 0, i32 40, i32 14, i32 7}
!9 = !{i32 1, i32 0, i32 16, i32 40, i32 11, i32 7}
!10 = !{i32 2, i32 0, i32 24, i32 40, i32 14, i32 5}
!11 = !{i32 14, i32 7, i32 0, i32 0, i32 15}
!12 = !{i32 0, i32 0, i32 0, i32 1}
!13 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\D6\9C\B4\09\0A8A\DA\CF3\09\AF\FF\11\A9U\06\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"}
!14 = !{i32 0}
!15 = !{i32 1}
!16 = !{}
; CHECK-LABEL: @lgc.shader.VS.main(
; CHECK-NEXT: .entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc()
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @lgc.input.import.generic.v4i32(i1 false, i32 2, i32 0, i32 0, i32 poison)
; CHECK-NEXT: [[DOTFR:%.*]] = freeze <4 x i32> [[TMP2]]
; CHECK-NEXT: [[__LLPC_INPUT_PROXY_4_0_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[DOTFR]], i64 0
; CHECK-NEXT: [[__LLPC_INPUT_PROXY_4_4_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[DOTFR]], i64 1
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.load.user.data.i32(i32 0)
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP3]], i64 0
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr addrspace(4)
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[__LLPC_INPUT_PROXY_4_4_VEC_EXTRACT]], 16
; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP10]], align 16, !invariant.load !16
; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[__LLPC_INPUT_PROXY_4_0_VEC_EXTRACT]], 16
; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP13]]
; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP14]], align 16, !invariant.load !16
; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP12]])
; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP16]], i32 [[TMP12]])
; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP18]]
; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP19]], align 16, !invariant.load !16
; CHECK-NEXT: [[TMP21:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[TMP20]], i32 0, i32 0, i32 0, i32 0)
; CHECK-NEXT: [[TMP22:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP16]], <4 x float> [[TMP21]])
; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP8]])
; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP23]], i32 [[TMP8]])
; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP25]]
; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP26]], align 16, !invariant.load !16
; CHECK-NEXT: [[TMP28:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP23]], <4 x i32> [[TMP27]])
; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[TMP22]], <4 x i32> [[TMP28]], i32 1, i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
98 changes: 98 additions & 0 deletions lgc/test/scalarizationOfDescriptorLoadsTest2.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc
; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s
; ModuleID = 'lgcPipeline'
source_filename = "lgcPipeline"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32"
target triple = "amdgcn--amdpal"

; Function Attrs: nounwind
define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spirv.ExecutionModel !14 !lgc.shaderstage !15 {
.entry:
%0 = call <4 x i32> (...) @lgc.create.read.generic.input.v4i32(i32 2, i32 0, i32 0, i32 0, i32 0, i32 poison)
%.fr = freeze <4 x i32> %0
%__llpc_input_proxy_4.0.vec.extract = extractelement <4 x i32> %.fr, i64 0
%1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 4, i32 4, i64 0, i32 0)
%2 = call i32 (...) @lgc.create.get.desc.stride.i32(i32 4, i32 4, i64 0, i32 0)
%3 = mul i32 %__llpc_input_proxy_4.0.vec.extract, %2
%4 = sext i32 %3 to i64
%5 = getelementptr i8, ptr addrspace(4) %1, i64 %4
%6 = load <4 x i32>, ptr addrspace(4) %5, align 16, !invariant.load !16
%7 = getelementptr i8, ptr addrspace(4) %1, i64 %4
%8 = load <4 x i32>, ptr addrspace(4) %7, align 16, !invariant.load !16
call void (...) @lgc.create.image.store(<4 x i32> %8, i32 0, i32 8, <4 x i32> %6, i32 1)
ret void
}

; Function Attrs: nounwind willreturn memory(read)
declare <4 x i32> @lgc.create.read.generic.input.v4i32(...) local_unnamed_addr #1

; Function Attrs: nounwind memory(none)
declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #2

; Function Attrs: nounwind memory(none)
declare i32 @lgc.create.get.desc.stride.i32(...) local_unnamed_addr #2

; Function Attrs: nounwind memory(write)
declare void @lgc.create.image.store(...) local_unnamed_addr #3

attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" }
attributes #1 = { nounwind willreturn memory(read) }
attributes #2 = { nounwind memory(none) }
attributes #3 = { nounwind memory(write) }
attributes #4 = { nounwind }

!lgc.client = !{!0}
!lgc.options = !{!1}
!lgc.options.VS = !{!2}
!lgc.options.FS = !{!3}
!lgc.user.data.nodes = !{!4, !5, !6, !7}
!lgc.vertex.inputs = !{!8, !9, !10}
!lgc.color.export.formats = !{!11}
!lgc.rasterizer.state = !{!12}
!amdgpu.pal.metadata.msgpack = !{!13}

!0 = !{!"Vulkan"}
!1 = !{i32 -1055878566, i32 -1332805290, i32 1045905781, i32 -589165353, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216}
!2 = !{i32 1639417258, i32 -1495429105, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1}
!3 = !{i32 -1409621709, i32 -171549995, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1}
!4 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 0, i32 1, i32 1}
!5 = !{!"DescriptorTexelBuffer", i32 4, i32 66, i32 0, i32 16384, i64 0, i32 0, i32 4}
!6 = !{!"StreamOutTableVaPtr", i32 11, i32 4, i32 1, i32 1, i32 0}
!7 = !{!"IndirectUserDataVaPtr", i32 8, i32 2, i32 2, i32 1, i32 4}
!8 = !{i32 0, i32 0, i32 0, i32 40, i32 14, i32 7}
!9 = !{i32 1, i32 0, i32 16, i32 40, i32 11, i32 7}
!10 = !{i32 2, i32 0, i32 24, i32 40, i32 14, i32 5}
!11 = !{i32 14, i32 7, i32 0, i32 0, i32 15}
!12 = !{i32 0, i32 0, i32 0, i32 1}
!13 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\D6\9C\B4\09\0A8A\DA\CF3\09\AF\FF\11\A9U\06\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"}
!14 = !{i32 0}
!15 = !{i32 1}
!16 = !{}
; CHECK-LABEL: @lgc.shader.VS.main(
; CHECK-NEXT: .entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc()
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @lgc.input.import.generic.v4i32(i1 false, i32 2, i32 0, i32 0, i32 poison)
; CHECK-NEXT: [[DOTFR:%.*]] = freeze <4 x i32> [[TMP2]]
; CHECK-NEXT: [[__LLPC_INPUT_PROXY_4_0_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[DOTFR]], i64 0
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.load.user.data.i32(i32 0)
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP3]], i64 0
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr addrspace(4)
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[__LLPC_INPUT_PROXY_4_0_VEC_EXTRACT]], 16
; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP10]], align 16, !invariant.load !16
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP12]], align 16, !invariant.load !16
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <4 x float>
; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP8]])
; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP15]], i32 [[TMP8]])
; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP17]]
; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 16, !invariant.load !16
; CHECK-NEXT: [[TMP20:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP15]], <4 x i32> [[TMP19]])
; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[TMP14]], <4 x i32> [[TMP20]], i32 1, i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
Loading
Loading