Get rid of obsolete CPU <-> GPU dma sync path.
Always use ReBAR when available; this path has been dead code since forever.
Themaister committed Dec 12, 2023
1 parent 1f357ff commit cc51654
Showing 7 changed files with 31 additions and 121 deletions.
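
The change hinges on ReBAR (Resizable BAR): with it, device-local memory can also be host-visible, so the buffer pools can allocate from the LinkedDeviceHost domain, map the block, and let CPU writes land directly in GPU-visible memory, with no transfer-queue copy in between. As a minimal illustration (not part of this commit; the helper name find_rebar_memory_type and its UINT32_MAX fallback convention are assumptions of this sketch), selecting such a memory type with the stock Vulkan API looks roughly like this:

#include <vulkan/vulkan.h>
#include <cstdint>

// Hypothetical helper, not taken from Granite: scan the physical device's
// memory types for one that is DEVICE_LOCAL + HOST_VISIBLE (+ HOST_COHERENT),
// i.e. ReBAR/SAM-style memory the CPU can map and write directly.
// Returns UINT32_MAX when no such type exists and a staging copy would be needed.
static uint32_t find_rebar_memory_type(VkPhysicalDevice gpu, uint32_t type_bits)
{
	VkPhysicalDeviceMemoryProperties props;
	vkGetPhysicalDeviceMemoryProperties(gpu, &props);

	const VkMemoryPropertyFlags wanted =
			VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
			VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
			VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;

	for (uint32_t i = 0; i < props.memoryTypeCount; i++)
	{
		// type_bits comes from VkMemoryRequirements::memoryTypeBits for the buffer.
		if ((type_bits & (1u << i)) == 0)
			continue;
		if ((props.memoryTypes[i].propertyFlags & wanted) == wanted)
			return i;
	}
	return UINT32_MAX;
}

When no such memory type was available, the old code fell back to a host-only staging buffer plus a copy on the DMA queue at submission time; the diff below removes that fallback and all of its plumbing.
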
2 changes: 0 additions & 2 deletions application/scene_viewer_application.cpp
@@ -148,8 +148,6 @@ void SceneViewerApplication::read_quirks(const std::string &path)
 		ImplementationQuirks::get().use_transient_depth_stencil = doc["useTransientDepthStencil"].GetBool();
 	if (doc.HasMember("queueWaitOnSubmission"))
 		ImplementationQuirks::get().queue_wait_on_submission = doc["queueWaitOnSubmission"].GetBool();
-	if (doc.HasMember("stagingNeedDeviceLocal"))
-		ImplementationQuirks::get().staging_need_device_local = doc["stagingNeedDeviceLocal"].GetBool();
 	if (doc.HasMember("useAsyncComputePost"))
 		ImplementationQuirks::get().use_async_compute_post = doc["useAsyncComputePost"].GetBool();
 	if (doc.HasMember("renderGraphForceSingleQueue"))

39 changes: 9 additions & 30 deletions vulkan/buffer_pool.cpp
@@ -28,14 +28,12 @@
 namespace Vulkan
 {
 void BufferPool::init(Device *device_, VkDeviceSize block_size_,
-                      VkDeviceSize alignment_, VkBufferUsageFlags usage_,
-                      bool need_device_local_)
+                      VkDeviceSize alignment_, VkBufferUsageFlags usage_)
 {
 	device = device_;
 	block_size = block_size_;
 	alignment = alignment_;
 	usage = usage_;
-	need_device_local = need_device_local_;
 }
 
 void BufferPool::set_spill_region_size(VkDeviceSize spill_size_)
@@ -59,40 +57,22 @@ void BufferPool::reset()
 
 BufferBlock BufferPool::allocate_block(VkDeviceSize size)
 {
-	BufferDomain ideal_domain = need_device_local ?
-			BufferDomain::Device :
-			((usage & VK_BUFFER_USAGE_TRANSFER_SRC_BIT) != 0) ? BufferDomain::Host : BufferDomain::LinkedDeviceHost;
 
-	VkBufferUsageFlags extra_usage = ideal_domain == BufferDomain::Device ? VK_BUFFER_USAGE_TRANSFER_DST_BIT : 0;
+	BufferDomain ideal_domain = ((usage & VK_BUFFER_USAGE_TRANSFER_SRC_BIT) != 0) ?
+			BufferDomain::Host : BufferDomain::LinkedDeviceHost;
 
 	BufferBlock block;
 
 	BufferCreateInfo info;
 	info.domain = ideal_domain;
 	info.size = size;
-	info.usage = usage | extra_usage;
+	info.usage = usage;
 
-	block.gpu = device->create_buffer(info, nullptr);
-	device->set_name(*block.gpu, "chain-allocated-block-gpu");
-	block.gpu->set_internal_sync_object();
+	block.buffer = device->create_buffer(info, nullptr);
+	device->set_name(*block.buffer, "chain-allocated-block");
+	block.buffer->set_internal_sync_object();
 
-	// Try to map it, will fail unless the memory is host visible.
-	block.mapped = static_cast<uint8_t *>(device->map_host_buffer(*block.gpu, MEMORY_ACCESS_WRITE_BIT));
-	if (!block.mapped)
-	{
-		// Fall back to host memory, and remember to sync to gpu on submission time using DMA queue. :)
-		BufferCreateInfo cpu_info;
-		cpu_info.domain = BufferDomain::Host;
-		cpu_info.size = size;
-		cpu_info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
-
-		block.cpu = device->create_buffer(cpu_info, nullptr);
-		block.cpu->set_internal_sync_object();
-		device->set_name(*block.cpu, "chain-allocated-block-cpu");
-		block.mapped = static_cast<uint8_t *>(device->map_host_buffer(*block.cpu, MEMORY_ACCESS_WRITE_BIT));
-	}
-	else
-		block.cpu = block.gpu;
+	block.mapped = static_cast<uint8_t *>(device->map_host_buffer(*block.buffer, MEMORY_ACCESS_WRITE_BIT));
 
 	block.offset = 0;
 	block.alignment = alignment;
@@ -112,7 +92,7 @@ BufferBlock BufferPool::request_block(VkDeviceSize minimum_size)
 	auto back = std::move(blocks.back());
 	blocks.pop_back();
 
-	back.mapped = static_cast<uint8_t *>(device->map_host_buffer(*back.cpu, MEMORY_ACCESS_WRITE_BIT));
+	back.mapped = static_cast<uint8_t *>(device->map_host_buffer(*back.buffer, MEMORY_ACCESS_WRITE_BIT));
 	back.offset = 0;
 	return back;
 }
@@ -132,5 +112,4 @@ BufferPool::~BufferPool()
 {
 	VK_ASSERT(blocks.empty());
 }
-
 }

6 changes: 2 additions & 4 deletions vulkan/buffer_pool.hpp
@@ -42,8 +42,7 @@ struct BufferBlockAllocation
 struct BufferBlock
 {
 	~BufferBlock();
-	Util::IntrusivePtr<Buffer> gpu;
-	Util::IntrusivePtr<Buffer> cpu;
+	Util::IntrusivePtr<Buffer> buffer;
 	VkDeviceSize offset = 0;
 	VkDeviceSize alignment = 0;
 	VkDeviceSize size = 0;
@@ -72,7 +71,7 @@ class BufferPool
 {
 public:
 	~BufferPool();
-	void init(Device *device, VkDeviceSize block_size, VkDeviceSize alignment, VkBufferUsageFlags usage, bool need_device_local);
+	void init(Device *device, VkDeviceSize block_size, VkDeviceSize alignment, VkBufferUsageFlags usage);
 	void reset();
 
 	// Used for allocating UBOs, where we want to specify a fixed size for range,
@@ -97,6 +96,5 @@ class BufferPool
 	size_t max_retained_blocks = 0;
 	std::vector<BufferBlock> blocks;
 	BufferBlock allocate_block(VkDeviceSize size);
-	bool need_device_local = false;
 };
 }

10 changes: 5 additions & 5 deletions vulkan/command_buffer.cpp
@@ -2181,7 +2181,7 @@ void *CommandBuffer::allocate_constant_data(unsigned set, unsigned binding, VkDe
 		device->request_uniform_block(ubo_block, size);
 		data = ubo_block.allocate(size);
 	}
-	set_uniform_buffer(set, binding, *ubo_block.gpu, data.offset, data.padded_size);
+	set_uniform_buffer(set, binding, *ubo_block.buffer, data.offset, data.padded_size);
 	return data.host;
 }
 
@@ -2193,7 +2193,7 @@ void *CommandBuffer::allocate_index_data(VkDeviceSize size, VkIndexType index_ty
 		device->request_index_block(ibo_block, size);
 		data = ibo_block.allocate(size);
 	}
-	set_index_buffer(*ibo_block.gpu, data.offset, index_type);
+	set_index_buffer(*ibo_block.buffer, data.offset, index_type);
 	return data.host;
 }
 
@@ -2208,7 +2208,7 @@ void *CommandBuffer::update_buffer(const Buffer &buffer, VkDeviceSize offset, Vk
 		device->request_staging_block(staging_block, size);
 		data = staging_block.allocate(size);
 	}
-	copy_buffer(buffer, offset, *staging_block.cpu, data.offset, size);
+	copy_buffer(buffer, offset, *staging_block.buffer, data.offset, size);
 	return data.host;
 }
 
@@ -2248,7 +2248,7 @@ void *CommandBuffer::update_image(const Image &image, const VkOffset3D &offset,
 		data = staging_block.allocate(size);
 	}
 
-	copy_buffer_to_image(image, *staging_block.cpu, data.offset, offset, extent, row_length, image_height, subresource);
+	copy_buffer_to_image(image, *staging_block.buffer, data.offset, offset, extent, row_length, image_height, subresource);
 	return data.host;
 }
 
@@ -2271,7 +2271,7 @@ void *CommandBuffer::allocate_vertex_data(unsigned binding, VkDeviceSize size, V
 		data = vbo_block.allocate(size);
 	}
 
-	set_vertex_binding(binding, *vbo_block.gpu, data.offset, stride, step_rate);
+	set_vertex_binding(binding, *vbo_block.buffer, data.offset, stride, step_rate);
 	return data.host;
 }
 

85 changes: 15 additions & 70 deletions vulkan/device.cpp
@@ -945,17 +945,13 @@ void Device::set_context(const Context &context)
 	managers.semaphore.init(this);
 	managers.fence.init(this);
 	managers.event.init(this);
-	managers.vbo.init(this, 4 * 1024, 16, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
-	                  ImplementationQuirks::get().staging_need_device_local);
-	managers.ibo.init(this, 4 * 1024, 16, VK_BUFFER_USAGE_INDEX_BUFFER_BIT,
-	                  ImplementationQuirks::get().staging_need_device_local);
+	managers.vbo.init(this, 4 * 1024, 16, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT);
+	managers.ibo.init(this, 4 * 1024, 16, VK_BUFFER_USAGE_INDEX_BUFFER_BIT);
 	managers.ubo.init(this, 256 * 1024, std::max<VkDeviceSize>(16u, gpu_props.limits.minUniformBufferOffsetAlignment),
-	                  VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
-	                  ImplementationQuirks::get().staging_need_device_local);
+	                  VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
 	managers.ubo.set_spill_region_size(VULKAN_MAX_UBO_SIZE);
 	managers.staging.init(this, 64 * 1024, std::max<VkDeviceSize>(16u, gpu_props.limits.optimalBufferCopyOffsetAlignment),
-	                      VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
-	                      false);
+	                      VK_BUFFER_USAGE_TRANSFER_SRC_BIT);
 
 	managers.vbo.set_max_retained_blocks(256);
 	managers.ibo.set_max_retained_blocks(256);
@@ -1145,10 +1141,13 @@ void Device::init_stock_samplers()
 }
 
 static void request_block(Device &device, BufferBlock &block, VkDeviceSize size,
-                          BufferPool &pool, std::vector<BufferBlock> *dma, std::vector<BufferBlock> &recycle)
+                          BufferPool &pool, std::vector<BufferBlock> &recycle)
 {
 	if (block.mapped)
-		device.unmap_host_buffer(*block.cpu, MEMORY_ACCESS_WRITE_BIT);
+	{
+		device.unmap_host_buffer(*block.buffer, MEMORY_ACCESS_WRITE_BIT);
+		block.mapped = nullptr;
+	}
 
 	if (block.offset == 0)
 	{
@@ -1157,12 +1156,6 @@ static void request_block(Device &device, BufferBlock &block, VkDeviceSize size,
 	}
 	else
 	{
-		if (block.cpu != block.gpu)
-		{
-			VK_ASSERT(dma);
-			dma->push_back(block);
-		}
-
 		if (block.size == pool.get_block_size())
 			recycle.push_back(block);
 	}
@@ -1181,7 +1174,7 @@ void Device::request_vertex_block(BufferBlock &block, VkDeviceSize size)
 
 void Device::request_vertex_block_nolock(BufferBlock &block, VkDeviceSize size)
 {
-	request_block(*this, block, size, managers.vbo, &dma.vbo, frame().vbo_blocks);
+	request_block(*this, block, size, managers.vbo, frame().vbo_blocks);
 }
 
 void Device::request_index_block(BufferBlock &block, VkDeviceSize size)
@@ -1192,7 +1185,7 @@ void Device::request_index_block(BufferBlock &block, VkDeviceSize size)
 
 void Device::request_index_block_nolock(BufferBlock &block, VkDeviceSize size)
 {
-	request_block(*this, block, size, managers.ibo, &dma.ibo, frame().ibo_blocks);
+	request_block(*this, block, size, managers.ibo, frame().ibo_blocks);
 }
 
 void Device::request_uniform_block(BufferBlock &block, VkDeviceSize size)
@@ -1203,7 +1196,7 @@ void Device::request_uniform_block(BufferBlock &block, VkDeviceSize size)
 
 void Device::request_uniform_block_nolock(BufferBlock &block, VkDeviceSize size)
 {
-	request_block(*this, block, size, managers.ubo, &dma.ubo, frame().ubo_blocks);
+	request_block(*this, block, size, managers.ubo, frame().ubo_blocks);
 }
 
 void Device::request_staging_block(BufferBlock &block, VkDeviceSize size)
@@ -1214,7 +1207,7 @@ void Device::request_staging_block(BufferBlock &block, VkDeviceSize size)
 
 void Device::request_staging_block_nolock(BufferBlock &block, VkDeviceSize size)
 {
-	request_block(*this, block, size, managers.staging, nullptr, frame().staging_blocks);
+	request_block(*this, block, size, managers.staging, frame().staging_blocks);
 }
 
 void Device::submit(CommandBufferHandle &cmd, Fence *fence, unsigned semaphore_count, Semaphore *semaphores)
@@ -1339,9 +1332,6 @@ void Device::submit_empty(CommandBuffer::Type type, Fence *fence, SemaphoreHolde
 void Device::submit_empty_nolock(QueueIndices physical_type, Fence *fence,
                                  SemaphoreHolder *semaphore, int profiling_iteration)
 {
-	if (physical_type != QUEUE_INDEX_TRANSFER)
-		flush_frame(QUEUE_INDEX_TRANSFER);
-
 	InternalFence signalled_fence = {};
 
 	submit_queue(physical_type, fence ? &signalled_fence : nullptr, semaphore,
@@ -1717,10 +1707,6 @@ void Device::submit_queue(QueueIndices physical_type, InternalFence *fence,
                           SemaphoreHolder *external_semaphore,
                           unsigned semaphore_count, Semaphore *semaphores, int profiling_iteration)
 {
-	// Always check if we need to flush pending transfers.
-	if (physical_type != QUEUE_INDEX_TRANSFER)
-		flush_frame(QUEUE_INDEX_TRANSFER);
-
 	auto &data = queue_data[physical_type];
 	auto &submissions = frame().submissions[physical_type];
 
@@ -1822,49 +1808,8 @@ void Device::submit_queue(QueueIndices physical_type, InternalFence *fence,
 
 void Device::flush_frame(QueueIndices physical_type)
 {
-	if (queue_info.queues[physical_type] == VK_NULL_HANDLE)
-		return;
-
-	if (physical_type == QUEUE_INDEX_TRANSFER)
-		sync_buffer_blocks();
-	submit_queue(physical_type, nullptr);
-}
-
-void Device::sync_buffer_blocks()
-{
-	if (dma.vbo.empty() && dma.ibo.empty() && dma.ubo.empty())
-		return;
-
-	auto cmd = request_command_buffer_nolock(get_thread_index(), CommandBuffer::Type::AsyncTransfer, false);
-	cmd->begin_region("buffer-block-sync");
-
-	for (auto &block : dma.vbo)
-	{
-		VK_ASSERT(block.offset != 0);
-		cmd->copy_buffer(*block.gpu, 0, *block.cpu, 0, block.offset);
-	}
-
-	for (auto &block : dma.ibo)
-	{
-		VK_ASSERT(block.offset != 0);
-		cmd->copy_buffer(*block.gpu, 0, *block.cpu, 0, block.offset);
-	}
-
-	for (auto &block : dma.ubo)
-	{
-		VK_ASSERT(block.offset != 0);
-		cmd->copy_buffer(*block.gpu, 0, *block.cpu, 0, block.offset);
-	}
-
-	dma.vbo.clear();
-	dma.ibo.clear();
-	dma.ubo.clear();
-
-	cmd->end_region();
-
-	// Do not flush graphics or compute in this context.
-	// We must be able to inject semaphores into all currently enqueued graphics / compute.
-	submit_staging(cmd, false);
+	if (queue_info.queues[physical_type] != VK_NULL_HANDLE)
+		submit_queue(physical_type, nullptr);
 }
 
 void Device::end_frame_context()

9 changes: 0 additions & 9 deletions vulkan/device.hpp
@@ -691,14 +691,6 @@ class Device
 		uint64_t value;
 	};
 
-	// Pending buffers which need to be copied from CPU to GPU before submitting graphics or compute work.
-	struct
-	{
-		std::vector<BufferBlock> vbo;
-		std::vector<BufferBlock> ibo;
-		std::vector<BufferBlock> ubo;
-	} dma;
-
 	void submit_queue(QueueIndices physical_type, InternalFence *fence,
 	                  SemaphoreHolder *external_semaphore = nullptr,
 	                  unsigned semaphore_count = 0,
@@ -753,7 +745,6 @@ class Device
 	std::function<void ()> queue_lock_callback;
 	std::function<void ()> queue_unlock_callback;
 	void flush_frame(QueueIndices physical_type);
-	void sync_buffer_blocks();
 	void submit_empty_inner(QueueIndices type, InternalFence *fence,
 	                        SemaphoreHolder *external_semaphore,
 	                        unsigned semaphore_count,

1 change: 0 additions & 1 deletion vulkan/quirks.hpp
@@ -31,7 +31,6 @@ struct ImplementationQuirks
 	bool use_transient_color = true;
 	bool use_transient_depth_stencil = true;
 	bool queue_wait_on_submission = false;
-	bool staging_need_device_local = false;
 	bool use_async_compute_post = true;
 	bool render_graph_force_single_queue = false;
 	bool force_no_subgroups = false;
