Get rid of obsolete CPU <-> GPU dma sync path.
Always use ReBAR when available; this path has been dead code since forever.
Themaister committed Dec 12, 2023
1 parent 1f357ff commit cc51654
Showing 7 changed files with 31 additions and 121 deletions.
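
The change hinges on ReBAR (Resizable BAR): with it, device-local memory can also be host-visible, so the buffer pools can allocate from the LinkedDeviceHost domain, map the block, and let CPU writes land directly in GPU-visible memory, with no transfer-queue copy in between. As a minimal illustration (not part of this commit; the helper name find_rebar_memory_type and its UINT32_MAX fallback convention are assumptions of this sketch), selecting such a memory type with the stock Vulkan API looks roughly like this:

#include <vulkan/vulkan.h>
#include <cstdint>

// Hypothetical helper, not taken from Granite: scan the physical device's
// memory types for one that is DEVICE_LOCAL + HOST_VISIBLE (+ HOST_COHERENT),
// i.e. ReBAR/SAM-style memory the CPU can map and write directly.
// Returns UINT32_MAX when no such type exists and a staging copy would be needed.
static uint32_t find_rebar_memory_type(VkPhysicalDevice gpu, uint32_t type_bits)
{
	VkPhysicalDeviceMemoryProperties props;
	vkGetPhysicalDeviceMemoryProperties(gpu, &props);

	const VkMemoryPropertyFlags wanted =
			VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
			VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
			VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;

	for (uint32_t i = 0; i < props.memoryTypeCount; i++)
	{
		// type_bits comes from VkMemoryRequirements::memoryTypeBits for the buffer.
		if ((type_bits & (1u << i)) == 0)
			continue;
		if ((props.memoryTypes[i].propertyFlags & wanted) == wanted)
			return i;
	}
	return UINT32_MAX;
}

When no such memory type was available, the old code fell back to a host-only staging buffer plus a copy on the DMA queue at submission time; the diff below removes that fallback and all of its plumbing.
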
2 changes: 0 additions & 2 deletions application/scene_viewer_application.cpp
@@ -148,8 +148,6 @@ void SceneViewerApplication::read_quirks(const std::string &path)
 		ImplementationQuirks::get().use_transient_depth_stencil = doc["useTransientDepthStencil"].GetBool();
 	if (doc.HasMember("queueWaitOnSubmission"))
 		ImplementationQuirks::get().queue_wait_on_submission = doc["queueWaitOnSubmission"].GetBool();
-	if (doc.HasMember("stagingNeedDeviceLocal"))
-		ImplementationQuirks::get().staging_need_device_local = doc["stagingNeedDeviceLocal"].GetBool();
 	if (doc.HasMember("useAsyncComputePost"))
 		ImplementationQuirks::get().use_async_compute_post = doc["useAsyncComputePost"].GetBool();
 	if (doc.HasMember("renderGraphForceSingleQueue"))

39 changes: 9 additions & 30 deletions vulkan/buffer_pool.cpp
@@ -28,14 +28,12 @@
 namespace Vulkan
 {
 void BufferPool::init(Device *device_, VkDeviceSize block_size_,
-                      VkDeviceSize alignment_, VkBufferUsageFlags usage_,
-                      bool need_device_local_)
+                      VkDeviceSize alignment_, VkBufferUsageFlags usage_)
 {
 	device = device_;
 	block_size = block_size_;
 	alignment = alignment_;
 	usage = usage_;
-	need_device_local = need_device_local_;
 }
 
 void BufferPool::set_spill_region_size(VkDeviceSize spill_size_)
@@ -59,40 +57,22 @@ void BufferPool::reset()
 
 BufferBlock BufferPool::allocate_block(VkDeviceSize size)
 {
-	BufferDomain ideal_domain = need_device_local ?
-			BufferDomain::Device :
-			((usage & VK_BUFFER_USAGE_TRANSFER_SRC_BIT) != 0) ? BufferDomain::Host : BufferDomain::LinkedDeviceHost;
 
-	VkBufferUsageFlags extra_usage = ideal_domain == BufferDomain::Device ? VK_BUFFER_USAGE_TRANSFER_DST_BIT : 0;
+	BufferDomain ideal_domain = ((usage & VK_BUFFER_USAGE_TRANSFER_SRC_BIT) != 0) ?
+			BufferDomain::Host : BufferDomain::LinkedDeviceHost;
 
 	BufferBlock block;
 
 	BufferCreateInfo info;
 	info.domain = ideal_domain;
 	info.size = size;
-	info.usage = usage | extra_usage;
+	info.usage = usage;
 
-	block.gpu = device->create_buffer(info, nullptr);
-	device->set_name(*block.gpu, "chain-allocated-block-gpu");
-	block.gpu->set_internal_sync_object();
+	block.buffer = device->create_buffer(info, nullptr);
+	device->set_name(*block.buffer, "chain-allocated-block");
+	block.buffer->set_internal_sync_object();
 
-	// Try to map it, will fail unless the memory is host visible.
-	block.mapped = static_cast<uint8_t *>(device->map_host_buffer(*block.gpu, MEMORY_ACCESS_WRITE_BIT));
-	if (!block.mapped)
-	{
-		// Fall back to host memory, and remember to sync to gpu on submission time using DMA queue. :)
-		BufferCreateInfo cpu_info;
-		cpu_info.domain = BufferDomain::Host;
-		cpu_info.size = size;
-		cpu_info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
-
-		block.cpu = device->create_buffer(cpu_info, nullptr);
-		block.cpu->set_internal_sync_object();
-		device->set_name(*block.cpu, "chain-allocated-block-cpu");
-		block.mapped = static_cast<uint8_t *>(device->map_host_buffer(*block.cpu, MEMORY_ACCESS_WRITE_BIT));
-	}
-	else
-		block.cpu = block.gpu;
+	block.mapped = static_cast<uint8_t *>(device->map_host_buffer(*block.buffer, MEMORY_ACCESS_WRITE_BIT));
 
 	block.offset = 0;
 	block.alignment = alignment;
@@ -112,7 +92,7 @@ BufferBlock BufferPool::request_block(VkDeviceSize minimum_size)
 	auto back = std::move(blocks.back());
 	blocks.pop_back();
 
-	back.mapped = static_cast<uint8_t *>(device->map_host_buffer(*back.cpu, MEMORY_ACCESS_WRITE_BIT));
+	back.mapped = static_cast<uint8_t *>(device->map_host_buffer(*back.buffer, MEMORY_ACCESS_WRITE_BIT));
 	back.offset = 0;
 	return back;
 }
@@ -132,5 +112,4 @@ BufferPool::~BufferPool()
 {
 	VK_ASSERT(blocks.empty());
 }
-
 }

6 changes: 2 additions & 4 deletions vulkan/buffer_pool.hpp
@@ -42,8 +42,7 @@ struct BufferBlockAllocation
 struct BufferBlock
 {
 	~BufferBlock();
-	Util::IntrusivePtr<Buffer> gpu;
-	Util::IntrusivePtr<Buffer> cpu;
+	Util::IntrusivePtr<Buffer> buffer;
 	VkDeviceSize offset = 0;
 	VkDeviceSize alignment = 0;
 	VkDeviceSize size = 0;
@@ -72,7 +71,7 @@ class BufferPool
 {
 public:
 	~BufferPool();
-	void init(Device *device, VkDeviceSize block_size, VkDeviceSize alignment, VkBufferUsageFlags usage, bool need_device_local);
+	void init(Device *device, VkDeviceSize block_size, VkDeviceSize alignment, VkBufferUsageFlags usage);
 	void reset();
 
 	// Used for allocating UBOs, where we want to specify a fixed size for range,
@@ -97,6 +96,5 @@ class BufferPool
 	size_t max_retained_blocks = 0;
 	std::vector<BufferBlock> blocks;
 	BufferBlock allocate_block(VkDeviceSize size);
-	bool need_device_local = false;
 };
 }

10 changes: 5 additions & 5 deletions vulkan/command_buffer.cpp
@@ -2181,7 +2181,7 @@ void *CommandBuffer::allocate_constant_data(unsigned set, unsigned binding, VkDe
 		device->request_uniform_block(ubo_block, size);
 		data = ubo_block.allocate(size);
 	}
-	set_uniform_buffer(set, binding, *ubo_block.gpu, data.offset, data.padded_size);
+	set_uniform_buffer(set, binding, *ubo_block.buffer, data.offset, data.padded_size);
 	return data.host;
 }
 
@@ -2193,7 +2193,7 @@ void *CommandBuffer::allocate_index_data(VkDeviceSize size, VkIndexType index_ty
 		device->request_index_block(ibo_block, size);
 		data = ibo_block.allocate(size);
 	}
-	set_index_buffer(*ibo_block.gpu, data.offset, index_type);
+	set_index_buffer(*ibo_block.buffer, data.offset, index_type);
 	return data.host;
 }
 
@@ -2208,7 +2208,7 @@ void *CommandBuffer::update_buffer(const Buffer &buffer, VkDeviceSize offset, Vk
 		device->request_staging_block(staging_block, size);
 		data = staging_block.allocate(size);
 	}
-	copy_buffer(buffer, offset, *staging_block.cpu, data.offset, size);
+	copy_buffer(buffer, offset, *staging_block.buffer, data.offset, size);
 	return data.host;
 }
 
@@ -2248,7 +2248,7 @@ void *CommandBuffer::update_image(const Image &image, const VkOffset3D &offset,
 		data = staging_block.allocate(size);
 	}
 
-	copy_buffer_to_image(image, *staging_block.cpu, data.offset, offset, extent, row_length, image_height, subresource);
+	copy_buffer_to_image(image, *staging_block.buffer, data.offset, offset, extent, row_length, image_height, subresource);
 	return data.host;
 }
 
@@ -2271,7 +2271,7 @@ void *CommandBuffer::allocate_vertex_data(unsigned binding, VkDeviceSize size, V
 		data = vbo_block.allocate(size);
 	}
 
-	set_vertex_binding(binding, *vbo_block.gpu, data.offset, stride, step_rate);
+	set_vertex_binding(binding, *vbo_block.buffer, data.offset, stride, step_rate);
 	return data.host;
 }
 

85 changes: 15 additions & 70 deletions vulkan/device.cpp
@@ -945,17 +945,13 @@ void Device::set_context(const Context &context)
 	managers.semaphore.init(this);
 	managers.fence.init(this);
 	managers.event.init(this);
-	managers.vbo.init(this, 4 * 1024, 16, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
-	                  ImplementationQuirks::get().staging_need_device_local);
-	managers.ibo.init(this, 4 * 1024, 16, VK_BUFFER_USAGE_INDEX_BUFFER_BIT,
-	                  ImplementationQuirks::get().staging_need_device_local);
+	managers.vbo.init(this, 4 * 1024, 16, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT);
+	managers.ibo.init(this, 4 * 1024, 16, VK_BUFFER_USAGE_INDEX_BUFFER_BIT);
 	managers.ubo.init(this, 256 * 1024, std::max<VkDeviceSize>(16u, gpu_props.limits.minUniformBufferOffsetAlignment),
-	                  VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
-	                  ImplementationQuirks::get().staging_need_device_local);
+	                  VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
 	managers.ubo.set_spill_region_size(VULKAN_MAX_UBO_SIZE);
 	managers.staging.init(this, 64 * 1024, std::max<VkDeviceSize>(16u, gpu_props.limits.optimalBufferCopyOffsetAlignment),
-	                      VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
-	                      false);
+	                      VK_BUFFER_USAGE_TRANSFER_SRC_BIT);
 
 	managers.vbo.set_max_retained_blocks(256);
 	managers.ibo.set_max_retained_blocks(256);
@@ -1145,10 +1141,13 @@ void Device::init_stock_samplers()
 }
 
 static void request_block(Device &device, BufferBlock &block, VkDeviceSize size,
-                          BufferPool &pool, std::vector<BufferBlock> *dma, std::vector<BufferBlock> &recycle)
+                          BufferPool &pool, std::vector<BufferBlock> &recycle)
 {
 	if (block.mapped)
-		device.unmap_host_buffer(*block.cpu, MEMORY_ACCESS_WRITE_BIT);
+	{
+		device.unmap_host_buffer(*block.buffer, MEMORY_ACCESS_WRITE_BIT);
+		block.mapped = nullptr;
+	}
 
 	if (block.offset == 0)
 	{
@@ -1157,12 +1156,6 @@ static void request_block(Device &device, BufferBlock &block, VkDeviceSize size,
 	}
 	else
 	{
-		if (block.cpu != block.gpu)
-		{
-			VK_ASSERT(dma);
-			dma->push_back(block);
-		}
-
 		if (block.size == pool.get_block_size())
 			recycle.push_back(block);
 	}
@@ -1181,7 +1174,7 @@ void Device::request_vertex_block(BufferBlock &block, VkDeviceSize size)
 
 void Device::request_vertex_block_nolock(BufferBlock &block, VkDeviceSize size)
 {
-	request_block(*this, block, size, managers.vbo, &dma.vbo, frame().vbo_blocks);
+	request_block(*this, block, size, managers.vbo, frame().vbo_blocks);
 }
 
 void Device::request_index_block(BufferBlock &block, VkDeviceSize size)
@@ -1192,7 +1185,7 @@ void Device::request_index_block(BufferBlock &block, VkDeviceSize size)
 
 void Device::request_index_block_nolock(BufferBlock &block, VkDeviceSize size)
 {
-	request_block(*this, block, size, managers.ibo, &dma.ibo, frame().ibo_blocks);
+	request_block(*this, block, size, managers.ibo, frame().ibo_blocks);
 }
 
 void Device::request_uniform_block(BufferBlock &block, VkDeviceSize size)
@@ -1203,7 +1196,7 @@ void Device::request_uniform_block(BufferBlock &block, VkDeviceSize size)
 
 void Device::request_uniform_block_nolock(BufferBlock &block, VkDeviceSize size)
 {
-	request_block(*this, block, size, managers.ubo, &dma.ubo, frame().ubo_blocks);
+	request_block(*this, block, size, managers.ubo, frame().ubo_blocks);
 }
 
 void Device::request_staging_block(BufferBlock &block, VkDeviceSize size)
@@ -1214,7 +1207,7 @@ void Device::request_staging_block(BufferBlock &block, VkDeviceSize size)
 
 void Device::request_staging_block_nolock(BufferBlock &block, VkDeviceSize size)
 {
-	request_block(*this, block, size, managers.staging, nullptr, frame().staging_blocks);
+	request_block(*this, block, size, managers.staging, frame().staging_blocks);
 }
 
 void Device::submit(CommandBufferHandle &cmd, Fence *fence, unsigned semaphore_count, Semaphore *semaphores)
@@ -1339,9 +1332,6 @@ void Device::submit_empty(CommandBuffer::Type type, Fence *fence, SemaphoreHolde
 void Device::submit_empty_nolock(QueueIndices physical_type, Fence *fence,
                                  SemaphoreHolder *semaphore, int profiling_iteration)
 {
-	if (physical_type != QUEUE_INDEX_TRANSFER)
-		flush_frame(QUEUE_INDEX_TRANSFER);
-
 	InternalFence signalled_fence = {};
 
 	submit_queue(physical_type, fence ? &signalled_fence : nullptr, semaphore,
@@ -1717,10 +1707,6 @@ void Device::submit_queue(QueueIndices physical_type, InternalFence *fence,
                           SemaphoreHolder *external_semaphore,
                           unsigned semaphore_count, Semaphore *semaphores, int profiling_iteration)
 {
-	// Always check if we need to flush pending transfers.
-	if (physical_type != QUEUE_INDEX_TRANSFER)
-		flush_frame(QUEUE_INDEX_TRANSFER);
-
 	auto &data = queue_data[physical_type];
 	auto &submissions = frame().submissions[physical_type];
 
@@ -1822,49 +1808,8 @@ void Device::submit_queue(QueueIndices physical_type, InternalFence *fence,
 
 void Device::flush_frame(QueueIndices physical_type)
 {
-	if (queue_info.queues[physical_type] == VK_NULL_HANDLE)
-		return;
-
-	if (physical_type == QUEUE_INDEX_TRANSFER)
-		sync_buffer_blocks();
-	submit_queue(physical_type, nullptr);
-}
-
-void Device::sync_buffer_blocks()
-{
-	if (dma.vbo.empty() && dma.ibo.empty() && dma.ubo.empty())
-		return;
-
-	auto cmd = request_command_buffer_nolock(get_thread_index(), CommandBuffer::Type::AsyncTransfer, false);
-	cmd->begin_region("buffer-block-sync");
-
-	for (auto &block : dma.vbo)
-	{
-		VK_ASSERT(block.offset != 0);
-		cmd->copy_buffer(*block.gpu, 0, *block.cpu, 0, block.offset);
-	}
-
-	for (auto &block : dma.ibo)
-	{
-		VK_ASSERT(block.offset != 0);
-		cmd->copy_buffer(*block.gpu, 0, *block.cpu, 0, block.offset);
-	}
-
-	for (auto &block : dma.ubo)
-	{
-		VK_ASSERT(block.offset != 0);
-		cmd->copy_buffer(*block.gpu, 0, *block.cpu, 0, block.offset);
-	}
-
-	dma.vbo.clear();
-	dma.ibo.clear();
-	dma.ubo.clear();
-
-	cmd->end_region();
-
-	// Do not flush graphics or compute in this context.
-	// We must be able to inject semaphores into all currently enqueued graphics / compute.
-	submit_staging(cmd, false);
+	if (queue_info.queues[physical_type] != VK_NULL_HANDLE)
+		submit_queue(physical_type, nullptr);
 }
 
 void Device::end_frame_context()

9 changes: 0 additions & 9 deletions vulkan/device.hpp
@@ -691,14 +691,6 @@ class Device
 		uint64_t value;
 	};
 
-	// Pending buffers which need to be copied from CPU to GPU before submitting graphics or compute work.
-	struct
-	{
-		std::vector<BufferBlock> vbo;
-		std::vector<BufferBlock> ibo;
-		std::vector<BufferBlock> ubo;
-	} dma;
-
 	void submit_queue(QueueIndices physical_type, InternalFence *fence,
 	                  SemaphoreHolder *external_semaphore = nullptr,
 	                  unsigned semaphore_count = 0,
@@ -753,7 +745,6 @@ class Device
 	std::function<void ()> queue_lock_callback;
 	std::function<void ()> queue_unlock_callback;
 	void flush_frame(QueueIndices physical_type);
-	void sync_buffer_blocks();
 	void submit_empty_inner(QueueIndices type, InternalFence *fence,
 	                        SemaphoreHolder *external_semaphore,
 	                        unsigned semaphore_count,

1 change: 0 additions & 1 deletion vulkan/quirks.hpp
@@ -31,7 +31,6 @@ struct ImplementationQuirks
 	bool use_transient_color = true;
 	bool use_transient_depth_stencil = true;
 	bool queue_wait_on_submission = false;
-	bool staging_need_device_local = false;
 	bool use_async_compute_post = true;
 	bool render_graph_force_single_queue = false;
 	bool force_no_subgroups = false;
