From d9ca45638f9aaa84fcdb0f97ac5877111e084820 Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Sun, 19 Nov 2023 09:39:46 +0100
Subject: [PATCH] Allow multi-op partial offloading by parsing the graph to
 preallocate enough between-op buffers

---
 ggml-vulkan.cpp | 170 ++++++++++++++++++++++++++++++------------------
 ggml-vulkan.h   |   2 +-
 llama.cpp       |   2 +-
 3 files changed, 108 insertions(+), 66 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 2eb349cc3cd22..08bb8262370da 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -156,12 +156,15 @@ struct ggml_vk_tensor_extra_gpu {
     std::vector comp_seqs;
     std::vector out_seqs;
 
+    int d_idx;
+
     size_t tensor_size;
     vk_buffer * gpu_buffer;
 };
 
 struct ggml_vk_garbage_collector {
     std::vector pipelines;
+    std::vector gpu_tensors;
     std::vector extras;
     std::vector tl_semaphores;
     std::vector semaphores;
@@ -189,8 +192,11 @@ vk_pipeline vk_pipeline_rms_norm_f32;
 static size_t vk_semaphore_idx;
 static ggml_vk_garbage_collector vk_gc;
 static std::vector> vk_pinned_memory;
-static size_t vk_prealloc_size_d, vk_prealloc_size_qx, vk_prealloc_size_qy, vk_prealloc_size_x, vk_prealloc_size_y;
-static vk_buffer vk_prealloc_d, vk_prealloc_qx, vk_prealloc_qy, vk_prealloc_x, vk_prealloc_y;
+static size_t vk_prealloc_size_qx, vk_prealloc_size_qy, vk_prealloc_size_x, vk_prealloc_size_y;
+static std::vector vk_prealloc_d_sizes;
+static std::vector vk_prealloc_d_blocked;
+static vk_buffer vk_prealloc_qx, vk_prealloc_qy, vk_prealloc_x, vk_prealloc_y;
+static std::vector vk_prealloc_d_buffers;
 static vk::Fence vk_fence;
 
 static vk_pipeline ggml_vk_create_pipeline(const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector&& specialization_constants, uint32_t align) {
@@ -1752,12 +1758,7 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
 
     GGML_ASSERT(extra->comp_seqs.empty());
 
-    vk_buffer* d_D;
-    if (dst->backend == GGML_BACKEND_GPU) {
-        d_D = (vk_buffer *) dst->data;
-    } else {
-        d_D = &vk_prealloc_d;
-    }
+    vk_buffer* d_D = &vk_prealloc_d_buffers[extra->d_idx];
     vk_buffer* d_X;
     vk_buffer* d_Y;
     if (load_x) {
@@ -1882,12 +1883,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
 
     GGML_ASSERT(extra->comp_seqs.empty());
 
-    vk_buffer* d_D;
-    if (dst->backend == GGML_BACKEND_GPU) {
-        d_D = (vk_buffer *) dst->data;
-    } else {
-        d_D = &vk_prealloc_d;
-    }
+    vk_buffer* d_D = &vk_prealloc_d_buffers[extra->d_idx];
     GGML_ASSERT(d_D->size >= d_sz * ne02 * ne03);
     vk_buffer* d_Qx;
     vk_buffer* d_Qy;
@@ -2064,12 +2060,7 @@ static void ggml_vk_mul_mat_vec_q_f16(const ggml_tensor * src0, const ggml_tenso
 
     GGML_ASSERT(extra->comp_seqs.empty());
 
-    vk_buffer* d_D;
-    if (dst->backend == GGML_BACKEND_GPU) {
-        d_D = (vk_buffer *) dst->data;
-    } else {
-        d_D = &vk_prealloc_d;
-    }
+    vk_buffer* d_D = &vk_prealloc_d_buffers[extra->d_idx];
     vk_buffer* d_Qx;
     vk_buffer* d_Qy;
     vk_buffer* d_Y;
@@ -2358,12 +2349,7 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
 
     GGML_ASSERT(extra->comp_seqs.empty());
 
-    vk_buffer* d_D;
-    if (dst->backend == GGML_BACKEND_GPU) {
-        d_D = (vk_buffer *) dst->data;
-    } else {
-        d_D = &vk_prealloc_d;
-    }
+    vk_buffer* d_D = &vk_prealloc_d_buffers[extra->d_idx];
     vk_buffer* d_X = nullptr;
     vk_buffer* d_Y = nullptr;
     if (transfer_src0) {
@@ -2545,12 +2531,25 @@ static void ggml_vk_realign_tensor(ggml_tensor * tensor) {
 
 static ggml_vk_tensor_extra_gpu * ggml_vk_preallocate_buffers(uint32_t d_size, uint32_t qx_size, uint32_t qy_size, uint32_t x_size, uint32_t y_size) {
     ggml_vk_tensor_extra_gpu * extra = new ggml_vk_tensor_extra_gpu;
+    extra->d_idx = -1;
     extra->tensor_size = d_size;
     extra->gpu_buffer = nullptr;
 
     // Check if buffer already exists, increase size if required
-    if (vk_prealloc_size_d < d_size) {
-        vk_prealloc_size_d = d_size;
+    for (size_t i = 0; i < vk_prealloc_d_sizes.size(); i++) {
+        if (!vk_prealloc_d_blocked[i]) {
+            extra->d_idx = i;
+            if (vk_prealloc_d_sizes[i] < d_size) {
+                vk_prealloc_d_sizes[i] = d_size;
+            }
+            break;
+        }
+    }
+    if (extra->d_idx == -1) {
+        vk_prealloc_d_sizes.push_back(d_size);
+        vk_prealloc_d_blocked.push_back(nullptr);
+        vk_prealloc_d_buffers.emplace_back();
+        extra->d_idx = vk_prealloc_d_buffers.size() - 1;
     }
     if (vk_prealloc_size_qx < qx_size) {
         vk_prealloc_size_qx = qx_size;
@@ -2570,14 +2569,26 @@ static ggml_vk_tensor_extra_gpu * ggml_vk_preallocate_buffers(uint32_t d_size, u
     return extra;
 }
 
-void ggml_vk_preallocate_buffers_graph(ggml_tensor * node){
+static ggml_tensor * ggml_vk_find_last_use(ggml_tensor * node, ggml_cgraph * graph) {
+    for (int i = graph->n_nodes - 1; i >= 0; i--) {
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (graph->nodes[i]->src[j] == node) {
+                return graph->nodes[i];
+            }
+        }
+    }
+
+    GGML_ASSERT(false);
+}
+
+void ggml_vk_preallocate_buffers_graph(ggml_tensor * node, ggml_cgraph * graph){
 #ifdef VK_DEBUG
     std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
 #endif
     node->extra = nullptr;
 
-    const bool src0_gpu = false; // node->src[0] != nullptr && node->src[0]->ne[1] > 32 && node->src[0]->extra != nullptr && node->src[0]->backend == GGML_BACKEND_CPU;
-    const bool src1_gpu = false; // node->src[1] != nullptr && node->src[1]->ne[1] > 32 && node->src[1]->extra != nullptr && node->src[1]->backend == GGML_BACKEND_CPU;
+    const bool src0_gpu = node->src[0] != nullptr && node->src[0]->ne[1] > 32 && node->src[0]->extra != nullptr && node->src[0]->backend == GGML_BACKEND_CPU;
+    const bool src1_gpu = node->src[1] != nullptr && node->src[1]->ne[1] > 32 && node->src[1]->extra != nullptr && node->src[1]->backend == GGML_BACKEND_CPU;
 
     const bool any_on_device = node->backend == GGML_BACKEND_GPU
         || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT || src0_gpu))
@@ -2631,6 +2642,44 @@ void ggml_vk_preallocate_buffers_graph(ggml_tensor * node){
     const uint32_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, vk_device.properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
     const uint32_t d_sz = ggml_vk_align_size(sizeof(float) * d_ne * split_k, vk_device.properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
+    // Block buffers for reuse early
+    switch (node->op) {
+    case GGML_OP_REPEAT:
+    case GGML_OP_GET_ROWS:
+    case GGML_OP_ADD:
+    case GGML_OP_SCALE:
+    case GGML_OP_MUL:
+    case GGML_OP_RMS_NORM:
+    case GGML_OP_MUL_MAT:
+        if (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node)) {
+            return;
+        }
+
+        // Reuse GPU buffer if previous op is also on GPU
+        if (src0_gpu) {
+            src0->backend = GGML_BACKEND_GPU;
+            ggml_vk_tensor_extra_gpu * src0_extra = (ggml_vk_tensor_extra_gpu *) src0->extra;
+
+            // Replace with data GPU tensor
+            vk_prealloc_d_blocked[src0_extra->d_idx] = ggml_vk_find_last_use(src0, graph);
+
+            // Handle buffer offset alignment issues in 2nd and 3rd dimensions early by changing stride
+            ggml_vk_realign_tensor(src0);
+        }
+        if (src1_gpu) {
+            src1->backend = GGML_BACKEND_GPU;
+            ggml_vk_tensor_extra_gpu * src1_extra = (ggml_vk_tensor_extra_gpu *) src1->extra;
+
+            // Replace with data GPU tensor
+            vk_prealloc_d_blocked[src1_extra->d_idx] = ggml_vk_find_last_use(src1, graph);
+
+            ggml_vk_realign_tensor(src1);
+        }
+
+    default:
+        break;
+    }
+
     switch (node->op) {
     case GGML_OP_REPEAT:
         node->extra = ggml_vk_preallocate_buffers(d_sz, qx_sz, 0, 0, 0);
         break;
@@ -2645,47 +2694,29 @@ void ggml_vk_preallocate_buffers_graph(ggml_tensor * node){
         node->extra = ggml_vk_preallocate_buffers(d_sz, transfer_src0 ? qx_sz : 0, transfer_src1 ? qy_sz : 0, 0, 0);
         break;
     case GGML_OP_MUL_MAT:
-        if (!any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node)) {
-            return;
-        }
-
         node->extra = ggml_vk_preallocate_buffers(d_sz, transfer_src0 ? qx_sz : 0, transfer_src1 ? qy_sz : 0, qx_needs_dequant ? x_sz : 0, qy_needs_dequant ? y_sz : 0);
         break;
     default:
        return;
     }
 
-    // Reuse GPU buffer if previous op is also on GPU
-    if (src0_gpu) {
-        src0->backend = GGML_BACKEND_GPU;
-        ggml_vk_tensor_extra_gpu * src0_extra = (ggml_vk_tensor_extra_gpu *) src0->extra;
-
-        // Replace with data GPU tensor
-        src0->data = malloc(sizeof(vk_buffer));
-        ggml_vk_pool_malloc(src0_extra->tensor_size, (vk_buffer *)src0->data, vk::MemoryPropertyFlagBits::eDeviceLocal);
-
-        // Handle buffer offset alignment issues in 2nd and 3rd dimensions early by changing stride
-        ggml_vk_realign_tensor(src0);
-    }
-    if (src1_gpu) {
-        src1->backend = GGML_BACKEND_GPU;
-        ggml_vk_tensor_extra_gpu * src1_extra = (ggml_vk_tensor_extra_gpu *) src1->extra;
-
-        // Replace with data GPU tensor
-        src1->data = malloc(sizeof(vk_buffer));
-        ggml_vk_pool_malloc(src1_extra->tensor_size, (vk_buffer *)src1->data, vk::MemoryPropertyFlagBits::eDeviceLocal);
-
-        ggml_vk_realign_tensor(src1);
+    // Unblock buffers if they terminate at current node
+    for (size_t i = 0; i < vk_prealloc_d_blocked.size(); i++) {
+        if (vk_prealloc_d_blocked[i] == node) {
+            vk_prealloc_d_blocked[i] = nullptr;
+        }
     }
 }
 
 void ggml_vk_preallocate_buffers() {
-    if (vk_prealloc_d.size < vk_prealloc_size_d) {
-        // Resize buffer
-        if (vk_prealloc_d.size > 0) {
-            ggml_vk_destroy_buffer(vk_prealloc_d);
+    for (size_t i = 0; i < vk_prealloc_d_sizes.size(); i++) {
+        if (vk_prealloc_d_buffers[i].size < vk_prealloc_d_sizes[i]) {
+            // Resize buffer
+            if (vk_prealloc_d_buffers[i].size > 0) {
+                ggml_vk_destroy_buffer(vk_prealloc_d_buffers[i]);
+            }
+            vk_prealloc_d_buffers[i] = ggml_vk_create_buffer(vk_prealloc_d_sizes[i], vk::MemoryPropertyFlagBits::eDeviceLocal);
         }
-        vk_prealloc_d = ggml_vk_create_buffer(vk_prealloc_size_d, vk::MemoryPropertyFlagBits::eDeviceLocal);
     }
     if (vk_prealloc_qx.size < vk_prealloc_size_qx) {
         // Resize buffer
@@ -2772,6 +2803,15 @@ void ggml_vk_build_graph(ggml_tensor * node){
         }
         return;
     }
+
+    // Set data to vk_buffer if backend is GPU
+    // This can't be done earlier cause the buffer may not exist yet
+    if (node->backend == GGML_BACKEND_GPU) {
+        node->data = malloc(sizeof(vk_buffer));
+        *(vk_buffer*) node->data = vk_prealloc_d_buffers[((ggml_vk_tensor_extra_gpu *) node->extra)->d_idx];
+
+        vk_gc.gpu_tensors.push_back(node);
+    }
 }
 
 bool ggml_vk_compute_forward(ggml_compute_params * params, ggml_tensor * tensor){
@@ -2869,12 +2909,14 @@ void ggml_vk_graph_cleanup() {
     vk_gc.tl_semaphores.clear();
 
     for (auto * extra : vk_gc.extras) {
-        if (extra->gpu_buffer != nullptr) {
-            ggml_vk_pool_free(*extra->gpu_buffer);
-        }
         delete extra;
     }
     vk_gc.extras.clear();
+
+    for (auto * tensor : vk_gc.gpu_tensors) {
+        delete (vk_buffer *) tensor->data;
+    }
+    vk_gc.gpu_tensors.clear();
 }
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
diff --git a/ggml-vulkan.h b/ggml-vulkan.h
index 61648ba3e4fb8..a4ed92e5c2f40 100644
--- a/ggml-vulkan.h
+++ b/ggml-vulkan.h
@@ -8,7 +8,7 @@ extern "C" {
 
 void ggml_vk_init(void);
 
-void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node);
+void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node, struct ggml_cgraph * graph);
 void ggml_vk_preallocate_buffers(void);
 void ggml_vk_build_graph(struct ggml_tensor * node);
 bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
diff --git a/llama.cpp b/llama.cpp
index d6ba620c4003c..3fcc2da91cc6f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -674,7 +674,7 @@ static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph *
 
 #ifdef GGML_USE_VULKAN
     for (int i = 0; i < graph->n_nodes; i++) {
-        ggml_vk_preallocate_buffers_graph(graph->nodes[i]);
+        ggml_vk_preallocate_buffers_graph(graph->nodes[i], graph);
     }
     ggml_vk_preallocate_buffers();