diff --git a/docs/own/stash.cpp b/docs/own/stash.cpp index e53e7ca..22626e1 100644 --- a/docs/own/stash.cpp +++ b/docs/own/stash.cpp @@ -1,3 +1,364 @@ + +/* +struct CaptureType { + Type* type {}; + bool converted {}; +}; + +CaptureType buildCaptureType(ShaderPatch& patch, const CaptureProcess& cpt) { +} + +struct CaptureProcess { + u32 typeID; + u32 loadedID; + const spc::Meta::Decoration* memberDeco; +}; + +Type* processCapture(ShaderPatch& patch, const CaptureProcess& cpt) { + auto [typeID, loadedID, memberDeco] = cpt; + auto& alloc = patch.alloc; + auto& compiler = patch.compiler; + + auto stype = &compiler.get_type(typeID); + if(stype->pointer) { + dlg_assert(stype->parent_type); + typeID = stype->parent_type; + stype = &compiler.get_type(typeID); + + // TODO load + } + + auto& dst = alloc.construct(); + dst.deco.typeID = typeID; + + auto* meta = compiler.get_ir().find_meta(typeID); + if(meta) { + dst.deco.name = copy(alloc, meta->decoration.alias); + } + + if(memberDeco) { + if(memberDeco->decoration_flags.get(spv::DecorationRowMajor)) { + dst.deco.flags |= Decoration::Bits::rowMajor; + } + if(memberDeco->decoration_flags.get(spv::DecorationColMajor)) { + dst.deco.flags |= Decoration::Bits::colMajor; + } + if(memberDeco->decoration_flags.get(spv::DecorationMatrixStride)) { + dst.deco.matrixStride = memberDeco->matrix_stride; + } + } + + // handle array + if(!stype->array.empty()) { + if(meta && meta->decoration.decoration_flags.get(spv::DecorationArrayStride)) { + dst.deco.arrayStride = meta->decoration.array_stride; + } + + dlg_assert(stype->array.size() == stype->array_size_literal.size()); + dst.array = alloc.alloc(stype->array.size()); + + for(auto d = 0u; d < stype->array.size(); ++d) { + if(stype->array_size_literal[d] == true) { + dst.array[d] = stype->array[d]; + } else { + dst.array[d] = compiler.evaluate_constant_u32(stype->array[d]); + } + } + + dst.deco.arrayTypeID = typeID; + + dlg_assert(stype->parent_type); + typeID = stype->parent_type; + stype = &compiler.get_type(typeID); + meta = compiler.get_ir().find_meta(typeID); + + dst.deco.typeID = typeID; + } + + if(stype->basetype == spc::SPIRType::Struct) { + // handle struct + dst.members = alloc.alloc(stype->member_types.size()); + for(auto i = 0u; i < stype->member_types.size(); ++i) { + auto memTypeID = stype->member_types[i]; + + const spc::Meta::Decoration* deco {}; + if(meta && meta->members.size() > i) { + deco = &meta->members[i]; + } + + // TODO PERF: remove allocation via dlg format here, + // use linearAllocator instead if needed + auto name = dlg::format("?{}", i); + if(deco && !deco->alias.empty()) { + // TODO PERF: we copy here with new, terrible + name = deco->alias; + } + + auto& mdst = dst.members[i]; + mdst.type = processCapture(patch, memTypeID, alloc, deco); + mdst.name = copy(alloc, name); + mdst.offset = deco ? deco->offset : 0u; + + if(!mdst.type) { + return nullptr; + } + } + + dst.type = Type::typeStruct; + return &dst; + } + + // handle atom + auto getBaseType = [](spc::SPIRType::BaseType t) -> std::optional { + switch(t) { + case spc::SPIRType::Double: + case spc::SPIRType::Float: + case spc::SPIRType::Half: + return Type::typeFloat; + + case spc::SPIRType::Int: + case spc::SPIRType::Short: + case spc::SPIRType::Int64: + case spc::SPIRType::SByte: + return Type::typeInt; + + case spc::SPIRType::UInt: + case spc::SPIRType::UShort: + case spc::SPIRType::UInt64: + case spc::SPIRType::UByte: + return Type::typeUint; + + case spc::SPIRType::Boolean: + return Type::typeBool; + + default: + return std::nullopt; + } + }; + + auto bt = getBaseType(stype->basetype); + if(!bt) { + dlg_error("Unsupported shader type: {}", u32(stype->basetype)); + return nullptr; + } + + dst.type = *bt; + dst.width = stype->width; + dst.vecsize = stype->vecsize; + dst.columns = stype->columns; + + return &dst; +} +*/ + +/* +ProcessedCapture processCaptureNonArray(ShaderPatch& patch, LinAllocScope& tms, + Type& type, span loadedIDs) { + u32 copiedTypeID = type.deco.typeID; + span retIDs = loadedIDs; + + if(!type.members.empty()) { + dlg_assert(type.type == Type::typeStruct); + auto copied = tms.alloc(loadedIDs.size()); + + span typeIDs = tms.alloc(loadedIDs.size()); + span> memberIDs = + tms.alloc>(loadedIDs.size()); + for(auto [i, member] : enumerate(type.members)) { + span loadedMembers = tms.alloc(loadedIDs.size()); + for(auto [j, id] : enumerate(loadedIDs)) { + loadedMembers[j] = patch.genOp(spv::OpCompositeExtract, + member.type->array.empty() ? member.type->deco.typeID : member.type->deco.arrayTypeID, + id, i); + } + + auto capture = processCapture(patch, tms, *member.type, loadedMembers); + memberIDs[i] = capture.ids; + typeIDs[i] = capture.typeID; + } + + copiedTypeID = ++patch.freeID; + patch.decl() + .push(copiedTypeID) + .push(typeIDs); + + // TODO offset deco + // TODO copy other member decos! + + for(auto [i, ids] : enumerate(memberIDs)) { + copied[i] = patch.genOp(spv::OpCompositeConstruct, copiedTypeID, ids); + } + + retIDs = copied; + } else if(type.type == Type::typeBool) { + type.type = Type::typeUint; + type.width = 32u; + type.deco.typeID = patch.typeUint; + copiedTypeID = patch.typeUint; + + auto copied = tms.alloc(loadedIDs.size()); + for(auto [i, src] : enumerate(loadedIDs)) { + copied[i] = patch.genOp(spv::OpSelect, patch.typeUint, + src, patch.const1, patch.const0); + } + + retIDs = copied; + } + + ProcessedCapture ret; + ret.typeID = copiedTypeID; + ret.ids = retIDs; + return ret; +} + +ProcessedCapture processCapture(ShaderPatch& patch, LinAllocScope& tms, + Type& type, span loadedIDs) { + if(type.array.empty()) { + return processCaptureNonArray(patch, tms, type, loadedIDs); + } + + auto totalCount = 1u; + for(auto dimSize : type.array) { + totalCount *= dimSize; + } + + span atomIDs = tms.alloc(loadedIDs.size() * totalCount); + u32 typeID = type.deco.arrayTypeID; + auto* spcType = &patch.compiler.get_type(typeID); + + for(auto [i, id] : enumerate(loadedIDs)) { + atomIDs[i * totalCount] = id; + } + auto lastCount = loadedIDs.size(); + auto stride = totalCount; + + for(auto dimSize : reversed(type.array)) { + dlg_assert(dimSize <= stride); + + for(auto srcOff = 0u; srcOff < lastCount; ++srcOff) { + auto srcID = srcOff * stride; + for(auto dstOff = 0u; dstOff < dimSize; ++dstOff) { + auto dstID = srcID + dstOff; + atomIDs[dstID] = patch.genOp(spv::OpCompositeExtract, + typeID, atomIDs[srcID], dstOff); + } + } + + dlg_assert(spcType->parent_type); + u32 typeID = spcType->parent_type; + spcType = &patch.compiler.get_type(typeID); + lastCount *= dimSize; + stride /= dimSize; + } + + dlg_assert(stride == 1u); + dlg_assert(lastCount == totalCount * loadedIDs.size()); + + auto baseCapture = processCaptureNonArray(patch, tms, type, atomIDs); + auto copiedTypeID = baseCapture.typeID; + std::copy(baseCapture.ids.begin(), baseCapture.ids.end(), atomIDs.begin()); + + for(auto dimSize : type.array) { + auto id = ++patch.freeID; + patch.decl() + .push(id) + .push(copiedTypeID) + .push(u32(dimSize)); + + // TODO stride deco. member? + + copiedTypeID = id; + dlg_assert(lastCount % dimSize == 0u); + auto dstCount = lastCount / dimSize; + + for(auto dstOff = 0u; dstOff < dstCount; ++dstOff) { + auto dstID = ++patch.freeID; + auto builder = patch.instr(spv::OpCompositeConstruct); + builder.push(copiedTypeID); + builder.push(dstID); + + for(auto srcOff = 0u; srcOff < dimSize; ++srcOff) { + auto srcID = dstOff * dimSize + srcOff; + builder.push(atomIDs[srcID]); + } + + atomIDs[dstOff] = dstID; + } + + lastCount = dstCount; + } + + dlg_assert(lastCount == loadedIDs.size()); + + ProcessedCapture ret; + ret.typeID = copiedTypeID; + ret.ids = atomIDs.first(lastCount); + return ret; +} + +void fixDecorateCaptureType(ShaderPatch& patch, Type& type) { + const auto& ir = patch.compiler.get_ir(); + if(!type.members.empty()) { + dlg_assert(type.type == Type::typeStruct); + + auto* meta = ir.find_meta(type.deco.typeID); + dlg_assert(meta && meta->members.size() == type.members.size()); + auto needsOffsetDeco = !meta->members[0].decoration_flags.get(spv::DecorationOffset); + auto offset = 0u; + + for(auto [i, member] : enumerate(type.members)) { + fixDecorateCaptureType(patch, *const_cast(member.type)); + + if(needsOffsetDeco) { + dlg_assert(!meta->members[0].decoration_flags.get(spv::DecorationOffset)); + offset = vil::alignPOT(offset, align(type, patch.bufLayout)); + member.offset = offset; + + patch.decl() + .push(type.deco.typeID) + .push(u32(i)) + .push(spv::DecorationOffset) + .push(offset); + + auto dstSize = size(*member.type, patch.bufLayout); + offset += dstSize; + } + } + } + + if(!type.array.empty()) { + dlg_assert(type.deco.arrayTypeID != 0u); + auto* meta = ir.find_meta(type.deco.arrayTypeID); + if(!meta || !meta->decoration.decoration_flags.get(spv::DecorationArrayStride)) { + dlg_assert(type.deco.arrayStride == 0u); + + auto tarray = type.array; + type.array = {}; + type.deco.arrayStride = align( + size(type, patch.bufLayout), + align(type, patch.bufLayout)); + type.array = tarray; + + patch.decl() + .push(type.deco.arrayTypeID) + .push(spv::DecorationArrayStride) + .push(type.deco.arrayStride); + } else { + dlg_assert(type.deco.arrayStride); + } + } + + // TODO: matrixStride + if(type.columns > 1u) { + dlg_error("TODO: add matrixstride deco"); + } +} +*/ + + + +//// +/// #include #include #include diff --git a/docs/own/todo.md b/docs/own/todo.md index f3e0ad0..4a40e0e 100644 --- a/docs/own/todo.md +++ b/docs/own/todo.md @@ -280,11 +280,19 @@ patch capture shader debugging: First gather everything (for every section etc) then do one patch-build pass. Current approach copies again and again, problematic for large shaders. + (NOTE: we already do this now for in-function instructions. + But should be done on per-section basis for decls, too) - [ ] potential CRASH: we assume the pipeLayout still has a valid handle. this might not be the case. We could recreate it, though. + Or make sure it is kept alive? - [ ] separate function arguments and local vars in UI - [ ] toggle via UI: also capture all local named SSA IDs -- [ ] matrix decoration in captured output +- [ ] only add breakpoint when there is prev/curr/next line marker. + Not across functions etc. +- [ ] patching: only load/convert all the variables when we know that this + shader should write stuff. Not in each invocation. + (atm done in processCapture already) +- [x] matrix decoration in captured output - [ ] show global variables in captured output? (entry point interface vars) - [ ] also builtins? maybe in different tab/node? - [ ] ray tracing debugging @@ -298,6 +306,11 @@ patch capture shader debugging: IntrusiveDerivedPtr, something else. Should keep it alive in a different way. - [x] fix terrible pipeline-keepAlive ShaderPatch hack +- [x] convert booleans in captured outputs + - [x] recursively, in structs. +- [x] fix phi instructions in following blocks +- [ ] allow showing all sources of all stages, inserting the breakpoint into all + Shaders should then probably output more about their origin. - [ ] additional shader debug selects (vertex/fragment) - [ ] Layer - [ ] ViewIndex @@ -308,6 +321,9 @@ patch capture shader debugging: - [ ] maybe create the inner handle as an intrusive ptr as well? can we really always use a basePipelineHandle tho? What if related resources got destroyed? + - meh does not help much (nvidia, linux). This time is mainly + problematic for ray tracing pipelines. + [ ] Try out pipeline library spvm: - [x] Add OpSpecConstant* support diff --git a/src/gui/shader.cpp b/src/gui/shader.cpp index 67efc62..f6d98ce 100644 --- a/src/gui/shader.cpp +++ b/src/gui/shader.cpp @@ -149,6 +149,12 @@ void ShaderDebugger::draw() { return; } + if(sourceFilesIDs_.empty()) { + // TODO: allow to show and debug plain spirv without debug sources? + imGuiText("The shader contains no debug sources"); + return; + } + auto* baseCmd = selection().command().back(); auto* stateCmd = deriveCast(baseCmd); dlg_assert(stateCmd->boundPipe()); @@ -438,6 +444,11 @@ const std::string& ShaderDebugger::fileName(u32 fileID) const { dlg_assert(fileID < ir.sources.size()); auto& source = ir.sources[fileID]; + if(source.fileID == 0u) { + static const auto empty = std::string {"unnamed"}; + return empty; + } + auto& str = ir.get(source.fileID); return str.str; } diff --git a/src/pipe.cpp b/src/pipe.cpp index 4570497..a5f9ab4 100644 --- a/src/pipe.cpp +++ b/src/pipe.cpp @@ -619,6 +619,8 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateRayTracingPipelinesKHR( copy.pLibraries = libHandles.data(); nci.pLibraryInfo = © } + + nci.flags |= VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT; } { diff --git a/src/util/buffmt.cpp b/src/util/buffmt.cpp index 95d9a0a..f5945d2 100644 --- a/src/util/buffmt.cpp +++ b/src/util/buffmt.cpp @@ -82,8 +82,8 @@ Type* buildType(const spc::Compiler& compiler, u32 typeID, dst.deco.arrayTypeID = typeID; - dlg_assert(stype->parent_type); - typeID = stype->parent_type; + dlg_assert(stype->self); + typeID = stype->self; stype = &compiler.get_type(typeID); meta = compiler.get_ir().find_meta(typeID); @@ -448,6 +448,12 @@ void display(const char* name, const Type& type, ReadBuf data, u32 offset) { } // display array + // we want the dimension with the highest stride first, makes + // displaying easier. + ThreadMemScope tms; + span array = tms.copy(type.array); + std::reverse(array.begin(), array.end()); + displayArrayDim(name, type, type.array, data, offset); } @@ -479,6 +485,7 @@ unsigned size(const Type& t, BufferLayout bl) { case Type::typeFloat: case Type::typeUint: case Type::typeInt: { + // TODO: matrixStride! auto vec = t.vecsize; if(bl == BufferLayout::std140 && vec == 3u) { vec = 4u; diff --git a/src/util/buffmt.hpp b/src/util/buffmt.hpp index 8217a55..70c31ef 100644 --- a/src/util/buffmt.hpp +++ b/src/util/buffmt.hpp @@ -35,6 +35,7 @@ struct Decoration { Flags flags {}; }; +NYTL_FLAG_OPS(Decoration::Bits); static_assert(std::is_trivially_destructible_v); // TODO: rename or move to namespace, kinda unfitting here. @@ -59,7 +60,7 @@ struct Type { struct Member { std::string_view name; - const Type* type; + Type* type; u32 offset; }; diff --git a/src/util/bufparser.cpp b/src/util/bufparser.cpp index 5a872d4..7821052 100644 --- a/src/util/bufparser.cpp +++ b/src/util/bufparser.cpp @@ -208,7 +208,7 @@ template using control = tao::pegtl::must_if::control builtins = { + static std::unordered_map builtins = { // base types createAtomPair("float", Type::typeFloat, 32, 1, 1), createAtomPair("f32", Type::typeFloat, 32, 1, 1), @@ -384,7 +384,7 @@ struct TreeParser { } } - const Type* parseType(const ParseTreeNode& node) const { + Type* parseType(const ParseTreeNode& node) const { checkType(node); auto name = node.string_view(); @@ -402,7 +402,7 @@ struct TreeParser { return t; } - const Type* applyArrayQualifiers(const ParseTreeNode& quals, const Type& in) { + Type* applyArrayQualifiers(const ParseTreeNode& quals, const Type& in) { checkType(quals); passert(!quals.children.empty(), quals); @@ -517,7 +517,7 @@ struct TreeParser { // Should probably just use a vector (or map/linked list with // LinearAllocator), we don't have so many types that we need an // unordered map - std::unordered_map structs_ {}; + std::unordered_map structs_ {}; const Type* main_ {}; BufferLayout bufferLayout_ {BufferLayout::std430}; // TODO }; diff --git a/src/util/patch.cpp b/src/util/patch.cpp index 0ed33be..849c4d0 100644 --- a/src/util/patch.cpp +++ b/src/util/patch.cpp @@ -1,4 +1,5 @@ #include "command/alloc.hpp" +#include #include #include #include @@ -16,9 +17,13 @@ struct InstrBuilder { spv::Op op; std::vector vals {0}; // first val is reserved + void prepareWrite() { + vals[0] = u16(vals.size()) << 16 | u16(op); + } + [[nodiscard]] u32 insert(std::vector& dst, u32 off) { + prepareWrite(); assert(dst.size() >= off); - vals[0] = u16(vals.size()) << 16 | u16(op); dst.insert(dst.begin() + off, vals.begin(), vals.end()); auto ret = vals.size(); vals.clear(); @@ -33,7 +38,7 @@ struct InstrBuilder { auto off = offsets.unnamed[sectionID + 1]; assert(dst.size() >= off); - vals[0] = u16(vals.size()) << 16 | u16(op); + prepareWrite(); dst.insert(dst.begin() + off, vals.begin(), vals.end()); // update section counts @@ -52,6 +57,12 @@ struct InstrBuilder { return *this; } + template + InstrBuilder& push(span vals) { + for(auto& val : vals) this->push(val); + return *this; + } + InstrBuilder& push(std::string_view val) { for(auto i = 0u; i < val.size(); i += 4) { u32 ret = val[i]; @@ -148,9 +159,9 @@ struct ShaderPatch { const Device& dev; const spc::Compiler& compiler; std::vector copy {}; + std::vector newFuncCode {}; spc::ParsedIR::SectionOffsets offsets {}; u32 freeID {}; - u32 funcInstrOffset {}; u32 typeBool = u32(-1); u32 typeFloat = u32(-1); @@ -208,10 +219,11 @@ struct FuncInstrBuilder : InstrBuilder { return; } - u32 off = patch_.funcInstrOffset; - u32 oldFuncOff = patch_.compiler.get_ir().section_offsets.named.funcs; - off += (patch_.offsets.named.funcs - oldFuncOff); - patch_.funcInstrOffset += InstrBuilder::insert(patch_.copy, off); + InstrBuilder::prepareWrite(); + patch_.newFuncCode.insert(patch_.newFuncCode.end(), + vals.begin(), vals.end()); + vals.clear(); + written_ = true; } }; @@ -394,11 +406,9 @@ void declareConstants(ShaderPatch& patch) { } struct VariableCapture { - u32 varID; u32 typeID; + u32 loadedID; Type* parsed; - - bool isPointer {}; u32 offset {}; }; @@ -409,62 +419,217 @@ bool supportedForCapture(ShaderPatch& patch, const spc::SPIRType& type) { type.basetype <= spc::SPIRType::Struct; } -void fixDecorateCaptureType(ShaderPatch& patch, Type& type) { - const auto& ir = patch.compiler.get_ir(); +// TODO: should we really modify Type* objects here? +// maybe clone them first? + +struct ProcessedCapture { + u32 typeID; + span ids; +}; + +ProcessedCapture processCapture(ShaderPatch& patch, LinAllocScope& tms, + Type& type, span loadedIDs); +ProcessedCapture processCaptureNonArray(ShaderPatch& patch, LinAllocScope& tms, + Type& type, span loadedIDs); + +ProcessedCapture processCaptureArray(ShaderPatch& patch, LinAllocScope& tms, + Type& type, span loadedIDs, span remArrayDims, + const spc::SPIRType& spcType) { + dlg_assert(!remArrayDims.empty()); + + auto dimSize = remArrayDims[0]; + dlg_assert(dimSize > 0); + remArrayDims = remArrayDims.subspan(1u); + + auto expanded = tms.alloc(loadedIDs.size() * dimSize); + for(auto i = 0u; i < loadedIDs.size(); ++i) { + for(auto j = 0u; j < dimSize; ++j) { + expanded[i * dimSize + j] = patch.genOp(spv::OpCompositeExtract, + spcType.parent_type, loadedIDs[i], j); + } + } + + auto subCapture = remArrayDims.empty() ? + processCaptureNonArray(patch, tms, type, expanded) : + processCaptureArray(patch, tms, type, expanded, remArrayDims, + patch.compiler.get_type(spcType.parent_type)); + + auto dstTypeID = ++patch.freeID; + patch.decl() + .push(dstTypeID) + .push(subCapture.typeID) + .push(u32(dimSize)); + + type.deco.arrayStride = align( + size(type, patch.bufLayout), + align(type, patch.bufLayout)); + patch.decl() + .push(dstTypeID) + .push(spv::DecorationArrayStride) + .push(type.deco.arrayStride); + + for(auto i = 0u; i < loadedIDs.size(); ++i) { + auto dstID = ++patch.freeID; + auto builder = patch.instr(spv::OpCompositeConstruct); + builder.push(dstTypeID); + builder.push(dstID); + + for(auto j = 0u; j < dimSize; ++j) { + auto srcID = i * dimSize + j; + builder.push(subCapture.ids[srcID]); + } + + expanded[i] = dstID; + } + + ProcessedCapture ret; + ret.typeID = dstTypeID; + ret.ids = expanded.first(loadedIDs.size()); + return ret; +} + +ProcessedCapture processCaptureNonArray(ShaderPatch& patch, LinAllocScope& tms, + Type& type, span loadedIDs) { + u32 copiedTypeID = type.deco.typeID; + span retIDs = loadedIDs; + if(!type.members.empty()) { + copiedTypeID = ++patch.freeID; + type.deco.typeID = copiedTypeID; + dlg_assert(type.type == Type::typeStruct); + auto copied = tms.alloc(loadedIDs.size()); - auto* meta = ir.find_meta(type.deco.typeID); - dlg_assert(meta && meta->members.size() == type.members.size()); - auto needsOffsetDeco = !meta->members[0].decoration_flags.get(spv::DecorationOffset); + span typeIDs = tms.alloc(type.members.size()); + span> memberIDs = + tms.alloc>(type.members.size()); auto offset = 0u; for(auto [i, member] : enumerate(type.members)) { - fixDecorateCaptureType(patch, *const_cast(member.type)); + span loadedMembers = tms.alloc(loadedIDs.size()); + for(auto [j, id] : enumerate(loadedIDs)) { + auto memberType = member.type->array.empty() ? + member.type->deco.typeID : member.type->deco.arrayTypeID; + loadedMembers[j] = patch.genOp(spv::OpCompositeExtract, + memberType, id, u32(i)); + } - if(needsOffsetDeco) { - dlg_assert(!meta->members[0].decoration_flags.get(spv::DecorationOffset)); - offset = vil::alignPOT(offset, align(type, patch.bufLayout)); - member.offset = offset; + auto capture = processCapture(patch, tms, *member.type, loadedMembers); + memberIDs[i] = capture.ids; + typeIDs[i] = capture.typeID; + offset = vil::alignPOT(offset, align(type, patch.bufLayout)); + patch.decl() + .push(copiedTypeID) + .push(u32(i)) + .push(spv::DecorationOffset) + .push(offset); + + member.offset = offset; + auto memberSize = size(*member.type, patch.bufLayout); + + if(type.columns > 1u) { patch.decl() - .push(type.deco.typeID) + .push(copiedTypeID) .push(u32(i)) - .push(spv::DecorationOffset) - .push(offset); + .push(spv::DecorationRowMajor); - auto dstSize = size(*member.type, patch.bufLayout); - offset += dstSize; + member.type->deco.flags &= ~Decoration::Bits::colMajor; + member.type->deco.flags |= Decoration::Bits::rowMajor; + + auto nc = member.type->vecsize; + if (patch.bufLayout == BufferLayout::std140 && nc == 3u) { + nc = 4u; + } + auto matrixStride = nc * member.type->width / 32u; + member.type->deco.matrixStride = matrixStride; + patch.decl() + .push(copiedTypeID) + .push(u32(i)) + .push(spv::DecorationMatrixStride) + .push(nc); } + + offset += memberSize; } - } - if(!type.array.empty()) { - dlg_assert(type.deco.arrayTypeID != 0u); - auto* meta = ir.find_meta(type.deco.arrayTypeID); - if(!meta || !meta->decoration.decoration_flags.get(spv::DecorationArrayStride)) { - dlg_assert(type.deco.arrayStride == 0u); + patch.decl() + .push(copiedTypeID) + .push(typeIDs); - auto tarray = type.array; - type.array = {}; - type.deco.arrayStride = align( - size(type, patch.bufLayout), - align(type, patch.bufLayout)); - type.array = tarray; + auto structBuild = tms.alloc(type.members.size()); + for(auto [i, dst] : enumerate(copied)) { + for(auto [j, ids] : enumerate(memberIDs)) { + structBuild[j] = ids[i]; + } + dst = patch.genOp(spv::OpCompositeConstruct, copiedTypeID, structBuild); + } - patch.decl() - .push(type.deco.arrayTypeID) - .push(spv::DecorationArrayStride) - .push(type.deco.arrayStride); - } else { - dlg_assert(type.deco.arrayStride); + retIDs = copied; + } else if(type.type == Type::typeBool) { + type.type = Type::typeUint; + type.width = 32u; + type.deco.typeID = patch.typeUint; + copiedTypeID = patch.typeUint; + + auto copied = tms.alloc(loadedIDs.size()); + for(auto [i, src] : enumerate(loadedIDs)) { + copied[i] = patch.genOp(spv::OpSelect, patch.typeUint, + src, patch.const1, patch.const0); } + + retIDs = copied; + } + + ProcessedCapture ret; + ret.typeID = copiedTypeID; + ret.ids = retIDs; + return ret; +} + +ProcessedCapture processCapture(ShaderPatch& patch, LinAllocScope& tms, + Type& type, span loadedIDs) { + // TODO: error-out for pointers. + // make sure to just ignore members that are pointers. + + if(type.array.empty()) { + return processCaptureNonArray(patch, tms, type, loadedIDs); + } + + span array = tms.copy(type.array); + std::reverse(array.begin(), array.end()); + return processCaptureArray(patch, tms, type, loadedIDs, array, + patch.compiler.get_type(type.deco.arrayTypeID)); +} + +struct VarCapture { + Type* type; + u32 resTypeID; + u32 loaded; +}; + +VarCapture processCapture(ShaderPatch& patch, u32 varID, u32 typeID) { + auto& compiler = patch.compiler; + auto& ir = compiler.get_ir(); + auto& srcType = ir.get(typeID); + if(!supportedForCapture(patch, srcType)) { + return {}; } - // TODO: matrixStride - if(type.columns > 1u) { - dlg_error("TODO: add matrixstride deco"); + auto loaded = varID; + + if(srcType.pointer) { + dlg_assert(srcType.pointer_depth == 1u); + typeID = srcType.parent_type; + loaded = patch.genOp(spv::OpLoad, typeID, varID); } + + auto* parsedType = buildType(patch.compiler, typeID, patch.alloc); + ThreadMemScope tms; + + auto captureRes = processCapture(patch, tms, *parsedType, {{loaded}}); + dlg_assert(captureRes.ids.size() == 1u); + return {parsedType, captureRes.typeID, captureRes.ids[0]}; } u32 findBuiltin(ShaderPatch& patch, @@ -546,7 +711,7 @@ u32 generateInvocationVertex(ShaderPatch& patch, auto& ir = patch.compiler.get_ir(); - auto extName = "SPV_KHR_shader_draw_parameters"; + auto extName = std::string_view("SPV_KHR_shader_draw_parameters"); if(!contains(ir.declared_extensions, extName)) { patch.decl().push(extName); } @@ -614,6 +779,60 @@ u32 generateCurrentInvocation(ShaderPatch& patch, } } +void fixPhiInstructions(ShaderPatch& patch, u32 dstBlock, + u32 oldBlockEnd, u32 newBlockEnd) { + auto& ir = patch.compiler.get_ir(); + auto& block = ir.get(dstBlock); + + for(auto [phiOff] : block.phi_instructions) { + auto eOffset = patch.offsets.named.funcs - ir.section_offsets.named.funcs; + auto toff = phiOff + eOffset; + auto& instrHead = patch.copy[toff]; + auto numWords = (instrHead >> 16u) & 0xFFFFu; + dlg_assert((instrHead & 0xFFFFu) == spv::OpPhi); + + // auto resType = patch.copy[toff + 1]; + // auto resID = patch.copy[toff + 2]; + + auto found = false; + for(auto i = 3u; i + 1 < numWords; i += 2) { + // auto parentVar = patch.copy[toff + i + 0]; + auto parentBlock = patch.copy[toff + i + 1]; + if(parentBlock == oldBlockEnd) { + dlg_assert(!found); + found = true; + + patch.copy[toff + i + 1] = newBlockEnd; + } + } + + dlg_assert(found); + } +} + +void fixPhiInstructions(ShaderPatch& path, spc::SPIRBlock& srcBlock, + u32 newBlockEnd) { + auto oldBlockEnd = srcBlock.self; + if(srcBlock.terminator == spc::SPIRBlock::Direct) { + fixPhiInstructions(path, srcBlock.next_block, + oldBlockEnd, newBlockEnd); + } else if(srcBlock.terminator == spc::SPIRBlock::Select) { + fixPhiInstructions(path, srcBlock.true_block, + oldBlockEnd, newBlockEnd); + fixPhiInstructions(path, srcBlock.false_block, + oldBlockEnd, newBlockEnd); + } else if(srcBlock.terminator == spc::SPIRBlock::MultiSelect) { + for(auto& caseBlock : srcBlock.cases_32bit) { + fixPhiInstructions(path, caseBlock.block, + oldBlockEnd, newBlockEnd); + } + for(auto& caseBlock : srcBlock.cases_64bit) { + fixPhiInstructions(path, caseBlock.block, + oldBlockEnd, newBlockEnd); + } + } +} + PatchResult patchShaderCapture(const Device& dev, const spc::Compiler& compiler, u32 file, u32 line, u64 captureAddress, const std::string& entryPointName, @@ -667,19 +886,19 @@ PatchResult patchShaderCapture(const Device& dev, const spc::Compiler& compiler, addressing = u32(spv::AddressingModelPhysicalStorageBuffer64); } - findDeclareBaseTypes(patch); - declareConstants(patch); + auto capName = spv::CapabilityPhysicalStorageBufferAddresses; + if(!contains(ir.declared_capabilities, capName)) { + patch.decl().push(capName); + } // allow us to use physical storage buffer pointers - auto extName = "SPV_KHR_physical_storage_buffer"; + auto extName = std::string_view("SPV_KHR_physical_storage_buffer"); if(!contains(ir.declared_extensions, extName)) { patch.decl().push(extName); } - auto capName = spv::CapabilityPhysicalStorageBufferAddresses; - if(!contains(ir.declared_capabilities, capName)) { - patch.decl() .push(capName); - } + findDeclareBaseTypes(patch); + declareConstants(patch); // parse local variables vil::Type baseType; @@ -691,18 +910,20 @@ PatchResult patchShaderCapture(const Device& dev, const spc::Compiler& compiler, auto offset = 0u; std::vector captures; - auto addCapture = [&](u32 varID, u32 typeID, bool isPointer, const std::string& name) { - auto* parsedType = buildType(patch.compiler, typeID, patch.alloc); - fixDecorateCaptureType(patch, *parsedType); + auto addCapture = [&](u32 varID, u32 typeID, const std::string& name) { + auto [parsedType, loadedType, loaded] = processCapture(patch, varID, typeID); + + if(!parsedType) { + return; + } offset = alignPOT(offset, align(*parsedType, patch.bufLayout)); auto& capture = captures.emplace_back(); capture.parsed = parsedType; - capture.typeID = typeID; - capture.varID = varID; capture.offset = offset; - capture.isPointer = isPointer; + capture.typeID = loadedType; + capture.loadedID = loaded; auto& member = members.emplace_back(); member.type = capture.parsed; @@ -730,15 +951,7 @@ PatchResult patchShaderCapture(const Device& dev, const spc::Compiler& compiler, continue; } - auto& srcType = ir.get(var.basetype); - if(!supportedForCapture(patch, srcType)) { - continue; - } - - dlg_assert(srcType.pointer); - dlg_assert(srcType.pointer_depth == 1u); - - addCapture(varID, srcType.parent_type, true, name); + addCapture(varID, var.basetype, name); } // capture function arguments @@ -748,20 +961,7 @@ PatchResult patchShaderCapture(const Device& dev, const spc::Compiler& compiler, continue; } - auto& srcType = ir.get(param.type); - if(!supportedForCapture(patch, srcType)) { - continue; - } - - auto typeID = param.type; - bool isPointer = false; - if(srcType.pointer) { - dlg_assert(srcType.pointer_depth == 1u); - typeID = srcType.parent_type; - isPointer = true; - } - - addCapture(param.id, typeID, isPointer, name); + addCapture(param.id, param.type, name); } // declare that struct type in spirv [patch] @@ -815,28 +1015,14 @@ PatchResult patchShaderCapture(const Device& dev, const spc::Compiler& compiler, .push(addressConstLow) .push(addressConstHigh); - // find builtin GlobalInvocationID - // - construct struct C via OpCompositeConstruct - patch.funcInstrOffset = lb->offset; + // construct struct C via OpCompositeConstruct auto srcStruct = ++freeID; { auto builder = patch.instr(spv::OpCompositeConstruct); builder.push(captureStruct); builder.push(srcStruct); for(auto [i, capture] : enumerate(captures)) { - u32 memID {}; - - if(capture.isPointer) { - memID = ++freeID; - patch.instr(spv::OpLoad) - .push(capture.typeID) - .push(memID) - .push(capture.varID); - } else { - memID = capture.varID; - } - - builder.push(memID); + builder.push(capture.loadedID); } } @@ -959,6 +1145,15 @@ PatchResult patchShaderCapture(const Device& dev, const spc::Compiler& compiler, // rest of the current block patch.instr(spv::OpLabel, blockRest); + // we changed control flow, have to fix phi instructions in following + // blocks. + fixPhiInstructions(patch, *lb->block, blockRest); + + // insert new function instructions + auto off = (patch.offsets.named.funcs - ir.section_offsets.named.funcs); + copy.insert(copy.begin() + lb->offset + off, + patch.newFuncCode.begin(), patch.newFuncCode.end()); + // update interface of entry point auto eOffset = patch.offsets.named.entry_points - ir.section_offsets.named.entry_points; auto& instrHead = patch.copy[entryPoint.offset + eOffset]; @@ -1111,6 +1306,15 @@ vku::Pipeline createPatchCopy(const RayTracingPipeline& src, rti.layout = src.layout->handle; rti.maxPipelineRayRecursionDepth = src.maxPipelineRayRecursionDepth; + // TODO DATA RACE! + /* + if(src.handle) { + rti.basePipelineHandle = src.handle; + rti.flags |= VK_PIPELINE_CREATE_DERIVATIVE_BIT; + rti.basePipelineIndex = -1; + } + */ + std::vector dynStates { src.dynamicState.begin(), src.dynamicState.end()}; VkPipelineDynamicStateCreateInfo dynState {}; @@ -1140,7 +1344,49 @@ vku::Pipeline createPatchCopy(const RayTracingPipeline& src, rti.groupCount = groups.size(); rti.pGroups = groups.data(); - auto pipe = vku::Pipeline(dev, rti); + vku::Pipeline pipe; + constexpr auto useDeferredOp = false; + + if(useDeferredOp) { + VkDeferredOperationKHR dop {}; + VK_CHECK(dev.dispatch.CreateDeferredOperationKHR(dev.handle, nullptr, &dop)); + + VkPipeline vkPipe; + auto res = dev.dispatch.CreateRayTracingPipelinesKHR(dev.handle, dop, {}, 1u, + &rti, nullptr, &vkPipe); + dlg_assert(res == VK_OPERATION_DEFERRED_KHR); + + auto maxThreads = dev.dispatch.GetDeferredOperationMaxConcurrencyKHR( + dev.handle, dop); + dlg_trace("maxThreads: {}", maxThreads); + + auto cb = [&]{ + auto res = dev.dispatch.DeferredOperationJoinKHR(dev.handle, dop); + dlg_trace("res: {}", res); + }; + + std::vector> futures; + for(auto i = 0u; i < std::min(7u, maxThreads - 1); ++i) { + futures.emplace_back(std::async(std::launch::async, cb)); + } + + do { + res = dev.dispatch.DeferredOperationJoinKHR(dev.handle, dop); + } while(res == VK_THREAD_IDLE_KHR); + + dlg_trace("main res {}", res); + + for(auto& future : futures) { + future.get(); + } + + res = dev.dispatch.GetDeferredOperationResultKHR(dev.handle, dop); + dlg_assert(res == VK_SUCCESS); + + pipe = vku::Pipeline(dev, vkPipe); + } else { + pipe = vku::Pipeline(dev, rti); + } auto handleSize = dev.rtProps.shaderGroupHandleSize; dlg_assert(handleSize % 4u == 0u); @@ -1254,6 +1500,8 @@ PatchJobResult patchJob(PatchJobData& data) { output += std::to_string(hash); output += ".spv"; writeFile(output.c_str(), bytes(patchRes.copy), true); + + dlg_trace("Wrote {}, {} bytes", output, bytes(patchRes.copy).size()); #endif // VIL_OUTPUT_PATCHED_SPIRV OwnBuffer shaderTableMapping;