diff --git a/docs/own/stash.cpp b/docs/own/stash.cpp
index e53e7ca..22626e1 100644
--- a/docs/own/stash.cpp
+++ b/docs/own/stash.cpp
@@ -1,3 +1,364 @@
+
+/*
+struct CaptureType {
+	Type* type {};
+	bool converted {};
+};
+
+CaptureType buildCaptureType(ShaderPatch& patch, const CaptureProcess& cpt) {
+}
+
+struct CaptureProcess {
+	u32 typeID;
+	u32 loadedID;
+	const spc::Meta::Decoration* memberDeco;
+};
+
+Type* processCapture(ShaderPatch& patch, const CaptureProcess& cpt) {
+	auto [typeID, loadedID, memberDeco] = cpt;
+	auto& alloc = patch.alloc;
+	auto& compiler = patch.compiler;
+
+	auto stype = &compiler.get_type(typeID);
+	if(stype->pointer) {
+		dlg_assert(stype->parent_type);
+		typeID = stype->parent_type;
+		stype = &compiler.get_type(typeID);
+
+		// TODO load
+	}
+
+	auto& dst = alloc.construct<Type>();
+	dst.deco.typeID = typeID;
+
+	auto* meta = compiler.get_ir().find_meta(typeID);
+	if(meta) {
+		dst.deco.name = copy(alloc, meta->decoration.alias);
+	}
+
+	if(memberDeco) {
+		if(memberDeco->decoration_flags.get(spv::DecorationRowMajor)) {
+			dst.deco.flags |= Decoration::Bits::rowMajor;
+		}
+		if(memberDeco->decoration_flags.get(spv::DecorationColMajor)) {
+			dst.deco.flags |= Decoration::Bits::colMajor;
+		}
+		if(memberDeco->decoration_flags.get(spv::DecorationMatrixStride)) {
+			dst.deco.matrixStride = memberDeco->matrix_stride;
+		}
+	}
+
+	// handle array
+	if(!stype->array.empty()) {
+		if(meta && meta->decoration.decoration_flags.get(spv::DecorationArrayStride)) {
+			dst.deco.arrayStride = meta->decoration.array_stride;
+		}
+
+		dlg_assert(stype->array.size() == stype->array_size_literal.size());
+		dst.array = alloc.alloc<u32>(stype->array.size());
+
+		for(auto d = 0u; d < stype->array.size(); ++d) {
+			if(stype->array_size_literal[d] == true) {
+				dst.array[d] = stype->array[d];
+			} else {
+				dst.array[d] = compiler.evaluate_constant_u32(stype->array[d]);
+			}
+		}
+
+		dst.deco.arrayTypeID = typeID;
+
+		dlg_assert(stype->parent_type);
+		typeID = stype->parent_type;
+		stype = &compiler.get_type(typeID);
+		meta = compiler.get_ir().find_meta(typeID);
+
+		dst.deco.typeID = typeID;
+	}
+
+	if(stype->basetype == spc::SPIRType::Struct) {
+		// handle struct
+		dst.members = alloc.alloc<Type::Member>(stype->member_types.size());
+		for(auto i = 0u; i < stype->member_types.size(); ++i) {
+			auto memTypeID = stype->member_types[i];
+
+			const spc::Meta::Decoration* deco {};
+			if(meta && meta->members.size() > i) {
+				deco = &meta->members[i];
+			}
+
+			// TODO PERF: remove allocation via dlg format here,
+			// use linearAllocator instead if needed
+			auto name = dlg::format("?{}", i);
+			if(deco && !deco->alias.empty()) {
+				// TODO PERF: we copy here with new, terrible
+				name = deco->alias;
+			}
+
+			auto& mdst = dst.members[i];
+			mdst.type = processCapture(patch, memTypeID, alloc, deco);
+			mdst.name = copy(alloc, name);
+			mdst.offset = deco ? deco->offset : 0u;
+
+			if(!mdst.type) {
+				return nullptr;
+			}
+		}
+
+		dst.type = Type::typeStruct;
+		return &dst;
+	}
+
+	// handle atom
+	auto getBaseType = [](spc::SPIRType::BaseType t) -> std::optional<Type::BaseType> {
+		switch(t) {
+			case spc::SPIRType::Double:
+			case spc::SPIRType::Float:
+			case spc::SPIRType::Half:
+				return Type::typeFloat;
+
+			case spc::SPIRType::Int:
+			case spc::SPIRType::Short:
+			case spc::SPIRType::Int64:
+			case spc::SPIRType::SByte:
+				return Type::typeInt;
+
+			case spc::SPIRType::UInt:
+			case spc::SPIRType::UShort:
+			case spc::SPIRType::UInt64:
+			case spc::SPIRType::UByte:
+				return Type::typeUint;
+
+			case spc::SPIRType::Boolean:
+				return Type::typeBool;
+
+			default:
+				return std::nullopt;
+		}
+	};
+
+	auto bt = getBaseType(stype->basetype);
+	if(!bt) {
+		dlg_error("Unsupported shader type: {}", u32(stype->basetype));
+		return nullptr;
+	}
+
+	dst.type = *bt;
+	dst.width = stype->width;
+	dst.vecsize = stype->vecsize;
+	dst.columns = stype->columns;
+
+	return &dst;
+}
+*/
+
+/*
+ProcessedCapture processCaptureNonArray(ShaderPatch& patch, LinAllocScope& tms,
+		Type& type, span<const u32> loadedIDs) {
+	u32 copiedTypeID = type.deco.typeID;
+	span<const u32> retIDs = loadedIDs;
+
+	if(!type.members.empty()) {
+		dlg_assert(type.type == Type::typeStruct);
+		auto copied = tms.alloc<u32>(loadedIDs.size());
+
+		span<u32> typeIDs = tms.alloc<u32>(loadedIDs.size());
+		span<span<const u32>> memberIDs =
+			tms.alloc<span<const u32>>(loadedIDs.size());
+		for(auto [i, member] : enumerate(type.members)) {
+			span<u32> loadedMembers = tms.alloc<u32>(loadedIDs.size());
+			for(auto [j, id] : enumerate(loadedIDs)) {
+				loadedMembers[j] = patch.genOp(spv::OpCompositeExtract,
+					member.type->array.empty() ? member.type->deco.typeID : member.type->deco.arrayTypeID,
+					id, i);
+			}
+
+			auto capture = processCapture(patch, tms, *member.type, loadedMembers);
+			memberIDs[i] = capture.ids;
+			typeIDs[i] = capture.typeID;
+		}
+
+		copiedTypeID = ++patch.freeID;
+		patch.decl<spv::OpTypeStruct>()
+			.push(copiedTypeID)
+			.push(typeIDs);
+
+		// TODO offset deco
+		// TODO copy other member decos!
+
+		for(auto [i, ids] : enumerate(memberIDs)) {
+			copied[i] = patch.genOp(spv::OpCompositeConstruct, copiedTypeID, ids);
+		}
+
+		retIDs = copied;
+	} else if(type.type == Type::typeBool) {
+		type.type = Type::typeUint;
+		type.width = 32u;
+		type.deco.typeID = patch.typeUint;
+		copiedTypeID = patch.typeUint;
+
+		auto copied = tms.alloc<u32>(loadedIDs.size());
+		for(auto [i, src] : enumerate(loadedIDs)) {
+			copied[i] = patch.genOp(spv::OpSelect, patch.typeUint,
+				src, patch.const1, patch.const0);
+		}
+
+		retIDs = copied;
+	}
+
+	ProcessedCapture ret;
+	ret.typeID = copiedTypeID;
+	ret.ids = retIDs;
+	return ret;
+}
+
+ProcessedCapture processCapture(ShaderPatch& patch, LinAllocScope& tms,
+		Type& type, span<const u32> loadedIDs) {
+	if(type.array.empty()) {
+		return processCaptureNonArray(patch, tms, type, loadedIDs);
+	}
+
+	auto totalCount = 1u;
+	for(auto dimSize : type.array) {
+		totalCount *= dimSize;
+	}
+
+	span<u32> atomIDs = tms.alloc<u32>(loadedIDs.size() * totalCount);
+	u32 typeID = type.deco.arrayTypeID;
+	auto* spcType = &patch.compiler.get_type(typeID);
+
+	for(auto [i, id] : enumerate(loadedIDs)) {
+		atomIDs[i * totalCount] = id;
+	}
+	auto lastCount = loadedIDs.size();
+	auto stride = totalCount;
+
+	for(auto dimSize : reversed(type.array)) {
+		dlg_assert(dimSize <= stride);
+
+		for(auto srcOff = 0u; srcOff < lastCount; ++srcOff) {
+			auto srcID = srcOff * stride;
+			for(auto dstOff = 0u; dstOff < dimSize; ++dstOff) {
+				auto dstID = srcID + dstOff;
+				atomIDs[dstID] = patch.genOp(spv::OpCompositeExtract,
+					typeID, atomIDs[srcID], dstOff);
+			}
+		}
+
+		dlg_assert(spcType->parent_type);
+		u32 typeID = spcType->parent_type;
+		spcType = &patch.compiler.get_type(typeID);
+		lastCount *= dimSize;
+		stride /= dimSize;
+	}
+
+	dlg_assert(stride == 1u);
+	dlg_assert(lastCount == totalCount * loadedIDs.size());
+
+	auto baseCapture = processCaptureNonArray(patch, tms, type, atomIDs);
+	auto copiedTypeID = baseCapture.typeID;
+	std::copy(baseCapture.ids.begin(), baseCapture.ids.end(), atomIDs.begin());
+
+	for(auto dimSize : type.array) {
+		auto id = ++patch.freeID;
+		patch.decl<spv::OpTypeArray>()
+			.push(id)
+			.push(copiedTypeID)
+			.push(u32(dimSize));
+
+		// TODO stride deco. member?
+
+		copiedTypeID = id;
+		dlg_assert(lastCount % dimSize == 0u);
+		auto dstCount = lastCount / dimSize;
+
+		for(auto dstOff = 0u; dstOff < dstCount; ++dstOff) {
+			auto dstID = ++patch.freeID;
+			auto builder = patch.instr(spv::OpCompositeConstruct);
+			builder.push(copiedTypeID);
+			builder.push(dstID);
+
+			for(auto srcOff = 0u; srcOff < dimSize; ++srcOff) {
+				auto srcID = dstOff * dimSize + srcOff;
+				builder.push(atomIDs[srcID]);
+			}
+
+			atomIDs[dstOff] = dstID;
+		}
+
+		lastCount = dstCount;
+	}
+
+	dlg_assert(lastCount == loadedIDs.size());
+
+	ProcessedCapture ret;
+	ret.typeID = copiedTypeID;
+	ret.ids = atomIDs.first(lastCount);
+	return ret;
+}
+
+void fixDecorateCaptureType(ShaderPatch& patch, Type& type) {
+	const auto& ir = patch.compiler.get_ir();
+	if(!type.members.empty()) {
+		dlg_assert(type.type == Type::typeStruct);
+
+		auto* meta = ir.find_meta(type.deco.typeID);
+		dlg_assert(meta && meta->members.size() == type.members.size());
+		auto needsOffsetDeco = !meta->members[0].decoration_flags.get(spv::DecorationOffset);
+		auto offset = 0u;
+
+		for(auto [i, member] : enumerate(type.members)) {
+			fixDecorateCaptureType(patch, *const_cast<Type*>(member.type));
+
+			if(needsOffsetDeco) {
+				dlg_assert(!meta->members[0].decoration_flags.get(spv::DecorationOffset));
+				offset = vil::alignPOT(offset, align(type, patch.bufLayout));
+				member.offset = offset;
+
+				patch.decl<spv::OpMemberDecorate>()
+					.push(type.deco.typeID)
+					.push(u32(i))
+					.push(spv::DecorationOffset)
+					.push(offset);
+
+				auto dstSize = size(*member.type, patch.bufLayout);
+				offset += dstSize;
+			}
+		}
+	}
+
+	if(!type.array.empty()) {
+		dlg_assert(type.deco.arrayTypeID != 0u);
+		auto* meta = ir.find_meta(type.deco.arrayTypeID);
+		if(!meta || !meta->decoration.decoration_flags.get(spv::DecorationArrayStride)) {
+			dlg_assert(type.deco.arrayStride == 0u);
+
+			auto tarray = type.array;
+			type.array = {};
+			type.deco.arrayStride = align(
+				size(type, patch.bufLayout),
+				align(type, patch.bufLayout));
+			type.array = tarray;
+
+			patch.decl<spv::OpDecorate>()
+				.push(type.deco.arrayTypeID)
+				.push(spv::DecorationArrayStride)
+				.push(type.deco.arrayStride);
+		} else {
+			dlg_assert(type.deco.arrayStride);
+		}
+	}
+
+	// TODO: matrixStride
+	if(type.columns > 1u) {
+		dlg_error("TODO: add matrixstride deco");
+	}
+}
+*/
+
+
+
+////
+///
 #include <fwd.hpp>
 #include <commandDesc.hpp>
 #include <cb.hpp>
diff --git a/docs/own/todo.md b/docs/own/todo.md
index f3e0ad0..4a40e0e 100644
--- a/docs/own/todo.md
+++ b/docs/own/todo.md
@@ -280,11 +280,19 @@ patch capture shader debugging:
       First gather everything (for every section etc) then do one
 	  patch-build pass.
 	  Current approach copies again and again, problematic for large shaders.
+	  (NOTE: we already do this now for in-function instructions.
+	   But should be done on per-section basis for decls, too)
 - [ ] potential CRASH: we assume the pipeLayout still has a valid handle.
 	  this might not be the case. We could recreate it, though.
+	  Or make sure it is kept alive?
 - [ ] separate function arguments and local vars in UI
 - [ ] toggle via UI: also capture all local named SSA IDs
-- [ ] matrix decoration in captured output
+- [ ] only add breakpoint when there is prev/curr/next line marker.
+      Not across functions etc.
+- [ ] patching: only load/convert all the variables when we know that this
+      shader should write stuff. Not in each invocation.
+	  (atm done in processCapture already)
+- [x] matrix decoration in captured output
 - [ ] show global variables in captured output? (entry point interface vars)
 	- [ ] also builtins? maybe in different tab/node?
 - [ ] ray tracing debugging
@@ -298,6 +306,11 @@ patch capture shader debugging:
       IntrusiveDerivedPtr, something else. Should keep it alive in
 	  a different way.
 - [x] fix terrible pipeline-keepAlive ShaderPatch hack
+- [x] convert booleans in captured outputs
+	- [x] recursively, in structs.
+- [x] fix phi instructions in following blocks
+- [ ] allow showing all sources of all stages, inserting the breakpoint into all
+      Shaders should then probably output more about their origin.
 - [ ] additional shader debug selects (vertex/fragment)
 	- [ ] Layer
 	- [ ] ViewIndex
@@ -308,6 +321,9 @@ patch capture shader debugging:
 	- [ ] maybe create the inner handle as an intrusive ptr as well?
 	      can we really always use a basePipelineHandle tho?
 		  What if related resources got destroyed?
+	- meh does not help much (nvidia, linux). This time is mainly
+	  problematic for ray tracing pipelines.
+	  [ ] Try out pipeline library
 
 spvm:
 - [x] Add OpSpecConstant* support
diff --git a/src/gui/shader.cpp b/src/gui/shader.cpp
index 67efc62..f6d98ce 100644
--- a/src/gui/shader.cpp
+++ b/src/gui/shader.cpp
@@ -149,6 +149,12 @@ void ShaderDebugger::draw() {
 		return;
 	}
 
+	if(sourceFilesIDs_.empty()) {
+		// TODO: allow to show and debug plain spirv without debug sources?
+		imGuiText("The shader contains no debug sources");
+		return;
+	}
+
 	auto* baseCmd = selection().command().back();
 	auto* stateCmd = deriveCast<const StateCmdBase*>(baseCmd);
 	dlg_assert(stateCmd->boundPipe());
@@ -438,6 +444,11 @@ const std::string& ShaderDebugger::fileName(u32 fileID) const {
 	dlg_assert(fileID < ir.sources.size());
 
 	auto& source = ir.sources[fileID];
+	if(source.fileID == 0u) {
+		static const auto empty = std::string {"unnamed"};
+		return empty;
+	}
+
 	auto& str = ir.get<spc::SPIRString>(source.fileID);
 	return str.str;
 }
diff --git a/src/pipe.cpp b/src/pipe.cpp
index 4570497..a5f9ab4 100644
--- a/src/pipe.cpp
+++ b/src/pipe.cpp
@@ -619,6 +619,8 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateRayTracingPipelinesKHR(
 			copy.pLibraries = libHandles.data();
 			nci.pLibraryInfo = &copy;
 		}
+
+		nci.flags |= VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT;
 	}
 
 	{
diff --git a/src/util/buffmt.cpp b/src/util/buffmt.cpp
index 95d9a0a..f5945d2 100644
--- a/src/util/buffmt.cpp
+++ b/src/util/buffmt.cpp
@@ -82,8 +82,8 @@ Type* buildType(const spc::Compiler& compiler, u32 typeID,
 
 		dst.deco.arrayTypeID = typeID;
 
-		dlg_assert(stype->parent_type);
-		typeID = stype->parent_type;
+		dlg_assert(stype->self);
+		typeID = stype->self;
 		stype = &compiler.get_type(typeID);
 		meta = compiler.get_ir().find_meta(typeID);
 
@@ -448,6 +448,12 @@ void display(const char* name, const Type& type, ReadBuf data, u32 offset) {
 	}
 
 	// display array
+	// we want the dimension with the highest stride first, makes
+	// displaying easier.
+	ThreadMemScope tms;
+	span<u32> array = tms.copy(type.array);
+	std::reverse(array.begin(), array.end());
+
 	displayArrayDim(name, type, type.array, data, offset);
 }
 
@@ -479,6 +485,7 @@ unsigned size(const Type& t, BufferLayout bl) {
 		case Type::typeFloat:
 		case Type::typeUint:
 		case Type::typeInt: {
+			// TODO: matrixStride!
 			auto vec = t.vecsize;
 			if(bl == BufferLayout::std140 && vec == 3u) {
 				vec = 4u;
diff --git a/src/util/buffmt.hpp b/src/util/buffmt.hpp
index 8217a55..70c31ef 100644
--- a/src/util/buffmt.hpp
+++ b/src/util/buffmt.hpp
@@ -35,6 +35,7 @@ struct Decoration {
 	Flags flags {};
 };
 
+NYTL_FLAG_OPS(Decoration::Bits);
 static_assert(std::is_trivially_destructible_v<Decoration>);
 
 // TODO: rename or move to namespace, kinda unfitting here.
@@ -59,7 +60,7 @@ struct Type {
 
 	struct Member {
 		std::string_view name;
-		const Type* type;
+		Type* type;
 		u32 offset;
 	};
 
diff --git a/src/util/bufparser.cpp b/src/util/bufparser.cpp
index 5a872d4..7821052 100644
--- a/src/util/bufparser.cpp
+++ b/src/util/bufparser.cpp
@@ -208,7 +208,7 @@ template<typename Rule> using control = tao::pegtl::must_if<error>::control<Rule
 } // namespace syn
 
 // tree parser
-const Type* parseBuiltin(std::string_view str) {
+Type* parseBuiltin(std::string_view str) {
 	constexpr auto createAtomPair = [](std::string_view name, Type::BaseType type,
 			u32 width, u32 vec, u32 col) {
 		Type ret;
@@ -225,7 +225,7 @@ const Type* parseBuiltin(std::string_view str) {
 		return std::pair(name, ret);
 	};
 
-	static const std::unordered_map<std::string_view, Type> builtins = {
+	static std::unordered_map<std::string_view, Type> builtins = {
 		// base types
 		createAtomPair("float", Type::typeFloat, 32, 1, 1),
 		createAtomPair("f32", Type::typeFloat, 32, 1, 1),
@@ -384,7 +384,7 @@ struct TreeParser {
 		}
 	}
 
-	const Type* parseType(const ParseTreeNode& node) const {
+	Type* parseType(const ParseTreeNode& node) const {
 		checkType<syn::Type>(node);
 		auto name = node.string_view();
 
@@ -402,7 +402,7 @@ struct TreeParser {
 		return t;
 	}
 
-	const Type* applyArrayQualifiers(const ParseTreeNode& quals, const Type& in) {
+	Type* applyArrayQualifiers(const ParseTreeNode& quals, const Type& in) {
 		checkType<syn::ArrayQualifiers>(quals);
 		passert(!quals.children.empty(), quals);
 
@@ -517,7 +517,7 @@ struct TreeParser {
 	// Should probably just use a vector (or map/linked list with
 	// LinearAllocator), we don't have so many types that we need an
 	// unordered map
-	std::unordered_map<std::string_view, const Type*> structs_ {};
+	std::unordered_map<std::string_view, Type*> structs_ {};
 	const Type* main_ {};
 	BufferLayout bufferLayout_ {BufferLayout::std430}; // TODO
 };
diff --git a/src/util/patch.cpp b/src/util/patch.cpp
index 0ed33be..849c4d0 100644
--- a/src/util/patch.cpp
+++ b/src/util/patch.cpp
@@ -1,4 +1,5 @@
 #include "command/alloc.hpp"
+#include <future>
 #include <numeric>
 #include <util/patch.hpp>
 #include <ds.hpp>
@@ -16,9 +17,13 @@ struct InstrBuilder {
 	spv::Op op;
 	std::vector<u32> vals {0}; // first val is reserved
 
+	void prepareWrite() {
+		vals[0] = u16(vals.size()) << 16 | u16(op);
+	}
+
 	[[nodiscard]] u32 insert(std::vector<u32>& dst, u32 off) {
+		prepareWrite();
 		assert(dst.size() >= off);
-		vals[0] = u16(vals.size()) << 16 | u16(op);
 		dst.insert(dst.begin() + off, vals.begin(), vals.end());
 		auto ret = vals.size();
 		vals.clear();
@@ -33,7 +38,7 @@ struct InstrBuilder {
 		auto off = offsets.unnamed[sectionID + 1];
 
 		assert(dst.size() >= off);
-		vals[0] = u16(vals.size()) << 16 | u16(op);
+		prepareWrite();
 		dst.insert(dst.begin() + off, vals.begin(), vals.end());
 
 		// update section counts
@@ -52,6 +57,12 @@ struct InstrBuilder {
 		return *this;
 	}
 
+	template<typename T>
+	InstrBuilder& push(span<T> vals) {
+		for(auto& val : vals) this->push(val);
+		return *this;
+	}
+
 	InstrBuilder& push(std::string_view val) {
 		for(auto i = 0u; i < val.size(); i += 4) {
 			u32 ret = val[i];
@@ -148,9 +159,9 @@ struct ShaderPatch {
 	const Device& dev;
 	const spc::Compiler& compiler;
 	std::vector<u32> copy {};
+	std::vector<u32> newFuncCode {};
 	spc::ParsedIR::SectionOffsets offsets {};
 	u32 freeID {};
-	u32 funcInstrOffset {};
 
 	u32 typeBool = u32(-1);
 	u32 typeFloat = u32(-1);
@@ -208,10 +219,11 @@ struct FuncInstrBuilder : InstrBuilder {
 			return;
 		}
 
-		u32 off = patch_.funcInstrOffset;
-		u32 oldFuncOff = patch_.compiler.get_ir().section_offsets.named.funcs;
-		off += (patch_.offsets.named.funcs - oldFuncOff);
-		patch_.funcInstrOffset += InstrBuilder::insert(patch_.copy, off);
+		InstrBuilder::prepareWrite();
+		patch_.newFuncCode.insert(patch_.newFuncCode.end(),
+			vals.begin(), vals.end());
+		vals.clear();
+
 		written_ = true;
 	}
 };
@@ -394,11 +406,9 @@ void declareConstants(ShaderPatch& patch) {
 }
 
 struct VariableCapture {
-	u32 varID;
 	u32 typeID;
+	u32 loadedID;
 	Type* parsed;
-
-	bool isPointer {};
 	u32 offset {};
 };
 
@@ -409,62 +419,217 @@ bool supportedForCapture(ShaderPatch& patch, const spc::SPIRType& type) {
 		type.basetype <= spc::SPIRType::Struct;
 }
 
-void fixDecorateCaptureType(ShaderPatch& patch, Type& type) {
-	const auto& ir = patch.compiler.get_ir();
+// TODO: should we really modify Type* objects here?
+// maybe clone them first?
+
+struct ProcessedCapture {
+	u32 typeID;
+	span<const u32> ids;
+};
+
+ProcessedCapture processCapture(ShaderPatch& patch, LinAllocScope& tms,
+		Type& type, span<const u32> loadedIDs);
+ProcessedCapture processCaptureNonArray(ShaderPatch& patch, LinAllocScope& tms,
+		Type& type, span<const u32> loadedIDs);
+
+ProcessedCapture processCaptureArray(ShaderPatch& patch, LinAllocScope& tms,
+		Type& type, span<const u32> loadedIDs, span<const u32> remArrayDims,
+		const spc::SPIRType& spcType) {
+	dlg_assert(!remArrayDims.empty());
+
+	auto dimSize = remArrayDims[0];
+	dlg_assert(dimSize > 0);
+	remArrayDims = remArrayDims.subspan(1u);
+
+	auto expanded = tms.alloc<u32>(loadedIDs.size() * dimSize);
+	for(auto i = 0u; i < loadedIDs.size(); ++i) {
+		for(auto j = 0u; j < dimSize; ++j) {
+			expanded[i * dimSize + j] = patch.genOp(spv::OpCompositeExtract,
+				spcType.parent_type, loadedIDs[i], j);
+		}
+	}
+
+	auto subCapture = remArrayDims.empty() ?
+		processCaptureNonArray(patch, tms, type, expanded) :
+		processCaptureArray(patch, tms, type, expanded, remArrayDims,
+			patch.compiler.get_type(spcType.parent_type));
+
+	auto dstTypeID = ++patch.freeID;
+	patch.decl<spv::OpTypeArray>()
+		.push(dstTypeID)
+		.push(subCapture.typeID)
+		.push(u32(dimSize));
+
+	type.deco.arrayStride = align(
+		size(type, patch.bufLayout),
+		align(type, patch.bufLayout));
+	patch.decl<spv::OpDecorate>()
+		.push(dstTypeID)
+		.push(spv::DecorationArrayStride)
+		.push(type.deco.arrayStride);
+
+	for(auto i = 0u; i < loadedIDs.size(); ++i) {
+		auto dstID = ++patch.freeID;
+		auto builder = patch.instr(spv::OpCompositeConstruct);
+		builder.push(dstTypeID);
+		builder.push(dstID);
+
+		for(auto j = 0u; j < dimSize; ++j) {
+			auto srcID = i * dimSize + j;
+			builder.push(subCapture.ids[srcID]);
+		}
+
+		expanded[i] = dstID;
+	}
+
+	ProcessedCapture ret;
+	ret.typeID = dstTypeID;
+	ret.ids = expanded.first(loadedIDs.size());
+	return ret;
+}
+
+ProcessedCapture processCaptureNonArray(ShaderPatch& patch, LinAllocScope& tms,
+		Type& type, span<const u32> loadedIDs) {
+	u32 copiedTypeID = type.deco.typeID;
+	span<const u32> retIDs = loadedIDs;
+
 	if(!type.members.empty()) {
+		copiedTypeID = ++patch.freeID;
+		type.deco.typeID = copiedTypeID;
+
 		dlg_assert(type.type == Type::typeStruct);
+		auto copied = tms.alloc<u32>(loadedIDs.size());
 
-		auto* meta = ir.find_meta(type.deco.typeID);
-		dlg_assert(meta && meta->members.size() == type.members.size());
-		auto needsOffsetDeco = !meta->members[0].decoration_flags.get(spv::DecorationOffset);
+		span<u32> typeIDs = tms.alloc<u32>(type.members.size());
+		span<span<const u32>> memberIDs =
+			tms.alloc<span<const u32>>(type.members.size());
 		auto offset = 0u;
 
 		for(auto [i, member] : enumerate(type.members)) {
-			fixDecorateCaptureType(patch, *const_cast<Type*>(member.type));
+			span<u32> loadedMembers = tms.alloc<u32>(loadedIDs.size());
+			for(auto [j, id] : enumerate(loadedIDs)) {
+				auto memberType = member.type->array.empty() ?
+					member.type->deco.typeID : member.type->deco.arrayTypeID;
+				loadedMembers[j] = patch.genOp(spv::OpCompositeExtract,
+					memberType, id, u32(i));
+			}
 
-			if(needsOffsetDeco) {
-				dlg_assert(!meta->members[0].decoration_flags.get(spv::DecorationOffset));
-				offset = vil::alignPOT(offset, align(type, patch.bufLayout));
-				member.offset = offset;
+			auto capture = processCapture(patch, tms, *member.type, loadedMembers);
+			memberIDs[i] = capture.ids;
+			typeIDs[i] = capture.typeID;
 
+			offset = vil::alignPOT(offset, align(type, patch.bufLayout));
+			patch.decl<spv::OpMemberDecorate>()
+				.push(copiedTypeID)
+				.push(u32(i))
+				.push(spv::DecorationOffset)
+				.push(offset);
+
+			member.offset = offset;
+			auto memberSize = size(*member.type, patch.bufLayout);
+
+			if(type.columns > 1u) {
 				patch.decl<spv::OpMemberDecorate>()
-					.push(type.deco.typeID)
+					.push(copiedTypeID)
 					.push(u32(i))
-					.push(spv::DecorationOffset)
-					.push(offset);
+					.push(spv::DecorationRowMajor);
 
-				auto dstSize = size(*member.type, patch.bufLayout);
-				offset += dstSize;
+				member.type->deco.flags &= ~Decoration::Bits::colMajor;
+				member.type->deco.flags |= Decoration::Bits::rowMajor;
+
+				auto nc = member.type->vecsize;
+				if (patch.bufLayout == BufferLayout::std140 && nc == 3u) {
+					nc = 4u;
+				}
+				auto matrixStride = nc * member.type->width / 32u;
+				member.type->deco.matrixStride = matrixStride;
+				patch.decl<spv::OpMemberDecorate>()
+					.push(copiedTypeID)
+					.push(u32(i))
+					.push(spv::DecorationMatrixStride)
+					.push(nc);
 			}
+
+			offset += memberSize;
 		}
-	}
 
-	if(!type.array.empty()) {
-		dlg_assert(type.deco.arrayTypeID != 0u);
-		auto* meta = ir.find_meta(type.deco.arrayTypeID);
-		if(!meta || !meta->decoration.decoration_flags.get(spv::DecorationArrayStride)) {
-			dlg_assert(type.deco.arrayStride == 0u);
+		patch.decl<spv::OpTypeStruct>()
+			.push(copiedTypeID)
+			.push(typeIDs);
 
-			auto tarray = type.array;
-			type.array = {};
-			type.deco.arrayStride = align(
-				size(type, patch.bufLayout),
-				align(type, patch.bufLayout));
-			type.array = tarray;
+		auto structBuild = tms.alloc<u32>(type.members.size());
+		for(auto [i, dst] : enumerate(copied)) {
+			for(auto [j, ids] : enumerate(memberIDs)) {
+				structBuild[j] = ids[i];
+			}
+			dst = patch.genOp(spv::OpCompositeConstruct, copiedTypeID, structBuild);
+		}
 
-			patch.decl<spv::OpDecorate>()
-				.push(type.deco.arrayTypeID)
-				.push(spv::DecorationArrayStride)
-				.push(type.deco.arrayStride);
-		} else {
-			dlg_assert(type.deco.arrayStride);
+		retIDs = copied;
+	} else if(type.type == Type::typeBool) {
+		type.type = Type::typeUint;
+		type.width = 32u;
+		type.deco.typeID = patch.typeUint;
+		copiedTypeID = patch.typeUint;
+
+		auto copied = tms.alloc<u32>(loadedIDs.size());
+		for(auto [i, src] : enumerate(loadedIDs)) {
+			copied[i] = patch.genOp(spv::OpSelect, patch.typeUint,
+				src, patch.const1, patch.const0);
 		}
+
+		retIDs = copied;
+	}
+
+	ProcessedCapture ret;
+	ret.typeID = copiedTypeID;
+	ret.ids = retIDs;
+	return ret;
+}
+
+ProcessedCapture processCapture(ShaderPatch& patch, LinAllocScope& tms,
+		Type& type, span<const u32> loadedIDs) {
+	// TODO: error-out for pointers.
+	//   make sure to just ignore members that are pointers.
+
+	if(type.array.empty()) {
+		return processCaptureNonArray(patch, tms, type, loadedIDs);
+	}
+
+	span<u32> array = tms.copy(type.array);
+	std::reverse(array.begin(), array.end());
+	return processCaptureArray(patch, tms, type, loadedIDs, array,
+		patch.compiler.get_type(type.deco.arrayTypeID));
+}
+
+struct VarCapture {
+	Type* type;
+	u32 resTypeID;
+	u32 loaded;
+};
+
+VarCapture processCapture(ShaderPatch& patch, u32 varID, u32 typeID) {
+	auto& compiler = patch.compiler;
+	auto& ir = compiler.get_ir();
+	auto& srcType = ir.get<spc::SPIRType>(typeID);
+	if(!supportedForCapture(patch, srcType)) {
+		return {};
 	}
 
-	// TODO: matrixStride
-	if(type.columns > 1u) {
-		dlg_error("TODO: add matrixstride deco");
+	auto loaded = varID;
+
+	if(srcType.pointer) {
+		dlg_assert(srcType.pointer_depth == 1u);
+		typeID = srcType.parent_type;
+		loaded = patch.genOp(spv::OpLoad, typeID, varID);
 	}
+
+	auto* parsedType = buildType(patch.compiler, typeID, patch.alloc);
+	ThreadMemScope tms;
+
+	auto captureRes = processCapture(patch, tms, *parsedType, {{loaded}});
+	dlg_assert(captureRes.ids.size() == 1u);
+	return {parsedType, captureRes.typeID, captureRes.ids[0]};
 }
 
 u32 findBuiltin(ShaderPatch& patch,
@@ -546,7 +711,7 @@ u32 generateInvocationVertex(ShaderPatch& patch,
 
 		auto& ir = patch.compiler.get_ir();
 
-		auto extName = "SPV_KHR_shader_draw_parameters";
+		auto extName = std::string_view("SPV_KHR_shader_draw_parameters");
 		if(!contains(ir.declared_extensions, extName)) {
 			patch.decl<spv::OpExtension>().push(extName);
 		}
@@ -614,6 +779,60 @@ u32 generateCurrentInvocation(ShaderPatch& patch,
 	}
 }
 
+void fixPhiInstructions(ShaderPatch& patch, u32 dstBlock,
+		u32 oldBlockEnd, u32 newBlockEnd) {
+	auto& ir = patch.compiler.get_ir();
+	auto& block = ir.get<spc::SPIRBlock>(dstBlock);
+
+	for(auto [phiOff] : block.phi_instructions) {
+		auto eOffset = patch.offsets.named.funcs - ir.section_offsets.named.funcs;
+		auto toff = phiOff + eOffset;
+		auto& instrHead = patch.copy[toff];
+		auto numWords = (instrHead >> 16u) & 0xFFFFu;
+		dlg_assert((instrHead & 0xFFFFu) == spv::OpPhi);
+
+		// auto resType = patch.copy[toff + 1];
+		// auto resID = patch.copy[toff + 2];
+
+		auto found = false;
+		for(auto i = 3u; i + 1 < numWords; i += 2) {
+			// auto parentVar = patch.copy[toff + i + 0];
+			auto parentBlock = patch.copy[toff + i + 1];
+			if(parentBlock == oldBlockEnd) {
+				dlg_assert(!found);
+				found = true;
+
+				patch.copy[toff + i + 1] = newBlockEnd;
+			}
+		}
+
+		dlg_assert(found);
+	}
+}
+
+void fixPhiInstructions(ShaderPatch& path, spc::SPIRBlock& srcBlock,
+		u32 newBlockEnd) {
+	auto oldBlockEnd = srcBlock.self;
+	if(srcBlock.terminator == spc::SPIRBlock::Direct) {
+		fixPhiInstructions(path, srcBlock.next_block,
+			oldBlockEnd, newBlockEnd);
+	} else if(srcBlock.terminator == spc::SPIRBlock::Select) {
+		fixPhiInstructions(path, srcBlock.true_block,
+			oldBlockEnd, newBlockEnd);
+		fixPhiInstructions(path, srcBlock.false_block,
+			oldBlockEnd, newBlockEnd);
+	} else if(srcBlock.terminator == spc::SPIRBlock::MultiSelect) {
+		for(auto& caseBlock : srcBlock.cases_32bit) {
+			fixPhiInstructions(path, caseBlock.block,
+				oldBlockEnd, newBlockEnd);
+		}
+		for(auto& caseBlock : srcBlock.cases_64bit) {
+			fixPhiInstructions(path, caseBlock.block,
+				oldBlockEnd, newBlockEnd);
+		}
+	}
+}
+
 PatchResult patchShaderCapture(const Device& dev, const spc::Compiler& compiler,
 		u32 file, u32 line,
 		u64 captureAddress, const std::string& entryPointName,
@@ -667,19 +886,19 @@ PatchResult patchShaderCapture(const Device& dev, const spc::Compiler& compiler,
 		addressing = u32(spv::AddressingModelPhysicalStorageBuffer64);
 	}
 
-	findDeclareBaseTypes(patch);
-	declareConstants(patch);
+	auto capName = spv::CapabilityPhysicalStorageBufferAddresses;
+	if(!contains(ir.declared_capabilities, capName)) {
+		patch.decl<spv::OpCapability>().push(capName);
+	}
 
 	// allow us to use physical storage buffer pointers
-	auto extName = "SPV_KHR_physical_storage_buffer";
+	auto extName = std::string_view("SPV_KHR_physical_storage_buffer");
 	if(!contains(ir.declared_extensions, extName)) {
 		patch.decl<spv::OpExtension>().push(extName);
 	}
 
-	auto capName = spv::CapabilityPhysicalStorageBufferAddresses;
-	if(!contains(ir.declared_capabilities, capName)) {
-		patch.decl<spv::OpCapability>() .push(capName);
-	}
+	findDeclareBaseTypes(patch);
+	declareConstants(patch);
 
 	// parse local variables
 	vil::Type baseType;
@@ -691,18 +910,20 @@ PatchResult patchShaderCapture(const Device& dev, const spc::Compiler& compiler,
 	auto offset = 0u;
 	std::vector<VariableCapture> captures;
 
-	auto addCapture = [&](u32 varID, u32 typeID, bool isPointer, const std::string& name) {
-		auto* parsedType = buildType(patch.compiler, typeID, patch.alloc);
-		fixDecorateCaptureType(patch, *parsedType);
+	auto addCapture = [&](u32 varID, u32 typeID, const std::string& name) {
+		auto [parsedType, loadedType, loaded] = processCapture(patch, varID, typeID);
+
+		if(!parsedType) {
+			return;
+		}
 
 		offset = alignPOT(offset, align(*parsedType, patch.bufLayout));
 
 		auto& capture = captures.emplace_back();
 		capture.parsed = parsedType;
-		capture.typeID = typeID;
-		capture.varID = varID;
 		capture.offset = offset;
-		capture.isPointer = isPointer;
+		capture.typeID = loadedType;
+		capture.loadedID = loaded;
 
 		auto& member = members.emplace_back();
 		member.type = capture.parsed;
@@ -730,15 +951,7 @@ PatchResult patchShaderCapture(const Device& dev, const spc::Compiler& compiler,
 			continue;
 		}
 
-		auto& srcType = ir.get<spc::SPIRType>(var.basetype);
-		if(!supportedForCapture(patch, srcType)) {
-			continue;
-		}
-
-		dlg_assert(srcType.pointer);
-		dlg_assert(srcType.pointer_depth == 1u);
-
-		addCapture(varID, srcType.parent_type, true, name);
+		addCapture(varID, var.basetype, name);
 	}
 
 	// capture function arguments
@@ -748,20 +961,7 @@ PatchResult patchShaderCapture(const Device& dev, const spc::Compiler& compiler,
 			continue;
 		}
 
-		auto& srcType = ir.get<spc::SPIRType>(param.type);
-		if(!supportedForCapture(patch, srcType)) {
-			continue;
-		}
-
-		auto typeID = param.type;
-		bool isPointer = false;
-		if(srcType.pointer) {
-			dlg_assert(srcType.pointer_depth == 1u);
-			typeID = srcType.parent_type;
-			isPointer = true;
-		}
-
-		addCapture(param.id, typeID, isPointer, name);
+		addCapture(param.id, param.type, name);
 	}
 
 	// declare that struct type in spirv [patch]
@@ -815,28 +1015,14 @@ PatchResult patchShaderCapture(const Device& dev, const spc::Compiler& compiler,
 		.push(addressConstLow)
 		.push(addressConstHigh);
 
-	// find builtin GlobalInvocationID
-	//  - construct struct C via OpCompositeConstruct
-	patch.funcInstrOffset = lb->offset;
+	//  construct struct C via OpCompositeConstruct
 	auto srcStruct = ++freeID;
 	{
 		auto builder = patch.instr(spv::OpCompositeConstruct);
 		builder.push(captureStruct);
 		builder.push(srcStruct);
 		for(auto [i, capture] : enumerate(captures)) {
-			u32 memID {};
-
-			if(capture.isPointer) {
-				memID = ++freeID;
-				patch.instr(spv::OpLoad)
-					.push(capture.typeID)
-					.push(memID)
-					.push(capture.varID);
-			} else {
-				memID = capture.varID;
-			}
-
-			builder.push(memID);
+			builder.push(capture.loadedID);
 		}
 	}
 
@@ -959,6 +1145,15 @@ PatchResult patchShaderCapture(const Device& dev, const spc::Compiler& compiler,
 	// rest of the current block
 	patch.instr(spv::OpLabel, blockRest);
 
+	// we changed control flow, have to fix phi instructions in following
+	// blocks.
+	fixPhiInstructions(patch, *lb->block, blockRest);
+
+	// insert new function instructions
+	auto off = (patch.offsets.named.funcs - ir.section_offsets.named.funcs);
+	copy.insert(copy.begin() + lb->offset + off,
+		patch.newFuncCode.begin(), patch.newFuncCode.end());
+
 	// update interface of entry point
 	auto eOffset = patch.offsets.named.entry_points - ir.section_offsets.named.entry_points;
 	auto& instrHead = patch.copy[entryPoint.offset + eOffset];
@@ -1111,6 +1306,15 @@ vku::Pipeline createPatchCopy(const RayTracingPipeline& src,
 	rti.layout = src.layout->handle;
 	rti.maxPipelineRayRecursionDepth = src.maxPipelineRayRecursionDepth;
 
+	// TODO DATA RACE!
+	/*
+	if(src.handle) {
+		rti.basePipelineHandle = src.handle;
+		rti.flags |= VK_PIPELINE_CREATE_DERIVATIVE_BIT;
+		rti.basePipelineIndex = -1;
+	}
+	*/
+
 	std::vector<VkDynamicState> dynStates {
 		src.dynamicState.begin(), src.dynamicState.end()};
 	VkPipelineDynamicStateCreateInfo dynState {};
@@ -1140,7 +1344,49 @@ vku::Pipeline createPatchCopy(const RayTracingPipeline& src,
 	rti.groupCount = groups.size();
 	rti.pGroups = groups.data();
 
-	auto pipe = vku::Pipeline(dev, rti);
+	vku::Pipeline pipe;
+	constexpr auto useDeferredOp = false;
+
+	if(useDeferredOp) {
+		VkDeferredOperationKHR dop {};
+		VK_CHECK(dev.dispatch.CreateDeferredOperationKHR(dev.handle, nullptr, &dop));
+
+		VkPipeline vkPipe;
+		auto res = dev.dispatch.CreateRayTracingPipelinesKHR(dev.handle, dop, {}, 1u,
+			&rti, nullptr, &vkPipe);
+		dlg_assert(res == VK_OPERATION_DEFERRED_KHR);
+
+		auto maxThreads = dev.dispatch.GetDeferredOperationMaxConcurrencyKHR(
+			dev.handle, dop);
+		dlg_trace("maxThreads: {}", maxThreads);
+
+		auto cb = [&]{
+			auto res = dev.dispatch.DeferredOperationJoinKHR(dev.handle, dop);
+			dlg_trace("res: {}", res);
+		};
+
+		std::vector<std::future<void>> futures;
+		for(auto i = 0u; i < std::min(7u, maxThreads - 1); ++i) {
+			futures.emplace_back(std::async(std::launch::async, cb));
+		}
+
+		do {
+			res = dev.dispatch.DeferredOperationJoinKHR(dev.handle, dop);
+		} while(res == VK_THREAD_IDLE_KHR);
+
+		dlg_trace("main res {}", res);
+
+		for(auto& future : futures) {
+			future.get();
+		}
+
+		res = dev.dispatch.GetDeferredOperationResultKHR(dev.handle, dop);
+		dlg_assert(res == VK_SUCCESS);
+
+		pipe = vku::Pipeline(dev, vkPipe);
+	} else {
+		pipe = vku::Pipeline(dev, rti);
+	}
 
 	auto handleSize = dev.rtProps.shaderGroupHandleSize;
 	dlg_assert(handleSize % 4u == 0u);
@@ -1254,6 +1500,8 @@ PatchJobResult patchJob(PatchJobData& data) {
 	output += std::to_string(hash);
 	output += ".spv";
 	writeFile(output.c_str(), bytes(patchRes.copy), true);
+
+	dlg_trace("Wrote {}, {} bytes", output, bytes(patchRes.copy).size());
 #endif // VIL_OUTPUT_PATCHED_SPIRV
 
 	OwnBuffer shaderTableMapping;