diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 8a8d33068f63f..11cff1478e84c 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -27,8 +27,8 @@
 #include
 #include
 #include
-#include <mutex>
 #include
+#include
 
 #include "ggml.h"
@@ -91,21 +91,6 @@ struct vk_pipeline {
 };
 
 struct vk_queue {
-    vk_queue() {}
-    vk_queue(const vk_queue& b) : queue_family_index(b.queue_family_index), queue(b.queue), pool(b.pool), cmd_buffer_idx(b.cmd_buffer_idx), cmd_buffers(b.cmd_buffers), stage_flags(b.stage_flags) {}
-
-    vk_queue& operator=(const vk_queue& b) {
-        if (this != &b) {
-            queue_family_index = b.queue_family_index;
-            queue = b.queue;
-            pool = b.pool;
-            cmd_buffer_idx = b.cmd_buffer_idx;
-            cmd_buffers = b.cmd_buffers;
-            stage_flags = b.stage_flags;
-        }
-        return *this;
-    }
-
     uint32_t queue_family_index;
     vk::Queue queue;
     vk::CommandPool pool;
@@ -113,14 +98,17 @@ struct vk_queue {
     std::vector<vk::CommandBuffer> cmd_buffers;
 
     vk::PipelineStageFlags stage_flags;
+};
 
-    std::mutex mutex;
+struct vk_semaphore {
+    vk::Semaphore s;
+    uint64_t value;
 };
 
 struct vk_submission {
     vk::CommandBuffer buffer;
-    std::vector<vk::Semaphore> wait_semaphores;
-    std::vector<vk::Semaphore> signal_semaphores;
+    std::vector<vk_semaphore> wait_semaphores;
+    std::vector<vk_semaphore> signal_semaphores;
 };
 
 typedef std::vector<vk_submission> vk_sequence;
@@ -382,7 +370,7 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_queue& q) {
     return buf;
 }
 
-static vk_submission ggml_vk_create_submission(vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores) {
+static vk_submission ggml_vk_create_submission(vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
 #ifdef VK_DEBUG
     std::cerr << "ggml_vk_create_submission()" << std::endl;
 #endif
@@ -393,7 +381,7 @@ static vk_submission ggml_vk_create_submission(vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores) {
     return s;
 }
 
-static vk_sequence ggml_vk_create_sequence_1(vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores) {
+static vk_sequence ggml_vk_create_sequence_1(vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
 #ifdef VK_DEBUG
     std::cerr << "ggml_vk_create_sequence_1()" << std::endl;
 #endif
@@ -410,6 +398,8 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk:
     std::vector<std::vector<uint64_t>> tl_wait_vals;
     std::vector<std::vector<uint64_t>> tl_signal_vals;
+    std::vector<std::vector<vk::Semaphore>> tl_wait_semaphores;
+    std::vector<std::vector<vk::Semaphore>> tl_signal_semaphores;
     std::vector<vk::TimelineSemaphoreSubmitInfo> tl_submit_infos;
     std::vector<vk::SubmitInfo> submit_infos;
     int idx = -1;
@@ -422,7 +412,9 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk:
     }
 
     // Pre-reserve vectors to prevent reallocation, which invalidates pointers
+    tl_wait_semaphores.reserve(reserve);
     tl_wait_vals.reserve(reserve);
+    tl_signal_semaphores.reserve(reserve);
     tl_signal_vals.reserve(reserve);
     tl_submit_infos.reserve(reserve);
     submit_infos.reserve(reserve);
@@ -432,13 +424,21 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk:
         for (const auto& submission : sequence) {
             stage_flags.push_back({});
             idx++;
+            tl_wait_vals.push_back({});
+            tl_wait_semaphores.push_back({});
+            tl_signal_vals.push_back({});
+            tl_signal_semaphores.push_back({});
             for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
                 stage_flags[idx].push_back(q.stage_flags);
+                GGML_ASSERT(submission.wait_semaphores[i].value > 0);
+                tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
+                tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
+            }
+            for (size_t i = 0; i < submission.signal_semaphores.size(); i++) {
+                GGML_ASSERT(submission.signal_semaphores[i].value > 0);
+                tl_signal_vals[idx].push_back(submission.signal_semaphores[i].value);
+                tl_signal_semaphores[idx].push_back(submission.signal_semaphores[i].s);
             }
-            tl_wait_vals.push_back({});
-            tl_wait_vals[idx].assign(submission.wait_semaphores.size(), 1);
-            tl_signal_vals.push_back({});
-            tl_signal_vals[idx].assign(submission.signal_semaphores.size(), 1);
             tl_submit_infos.push_back({
                 (uint32_t) submission.wait_semaphores.size(),
                 tl_wait_vals[idx].data(),
@@ -449,19 +449,17 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk:
             tl_submit_infos[idx].pNext = nullptr;
             submit_infos.push_back({
                 (uint32_t) submission.wait_semaphores.size(),
-                submission.wait_semaphores.data(),
+                tl_wait_semaphores[idx].data(),
                 stage_flags[idx].data(),
                 1,
                 &submission.buffer,
                 (uint32_t) submission.signal_semaphores.size(),
-                submission.signal_semaphores.data(),
-                &tl_submit_infos[idx],
+                tl_signal_semaphores[idx].data(),
+                &tl_submit_infos[idx],
             });
         }
     }
 
-    std::lock_guard guard(q.mutex);
-
     q.queue.submit(submit_infos, fence);
 
     sequences.clear();
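The core of the patch so far: binary vk::Semaphore handles are replaced by vk_semaphore, a (semaphore, target value) pair, and ggml_vk_submit forwards real per-submission wait/signal values instead of hardcoding every timeline value to 1. The pre-reserve calls matter because each vk::SubmitInfo keeps raw pointers (the pNext chain points at &tl_submit_infos[idx], and the .data() arrays point into the per-submission vectors); a reallocation mid-loop would leave earlier SubmitInfos dangling. For context, a minimal sketch of the timeline-semaphore plumbing this relies on, assuming Vulkan 1.2 (or VK_KHR_timeline_semaphore) via Vulkan-Hpp; the helper names are illustrative, not from the patch:

#include <vulkan/vulkan.hpp>

// Create a timeline semaphore starting at value 0.
static vk::Semaphore create_timeline_semaphore(vk::Device device) {
    vk::SemaphoreTypeCreateInfo type_info{ vk::SemaphoreType::eTimeline, 0 };
    vk::SemaphoreCreateInfo create_info{};
    create_info.pNext = &type_info;  // the semaphore type rides the pNext chain
    return device.createSemaphore(create_info);
}

// Submit one command buffer that waits for `sem` to reach `wait_val`
// and bumps it to `signal_val` when the work finishes.
static void submit_chained(vk::Queue queue, vk::CommandBuffer cmd, vk::Semaphore sem,
                           uint64_t wait_val, uint64_t signal_val,
                           vk::PipelineStageFlags stage) {
    vk::TimelineSemaphoreSubmitInfo tl_info{ 1, &wait_val, 1, &signal_val };
    vk::SubmitInfo submit_info{ 1, &sem, &stage, 1, &cmd, 1, &sem };
    submit_info.pNext = &tl_info;  // values travel in pNext, handles stay in SubmitInfo
    queue.submit({ submit_info }, vk::Fence{});
}

With this shape, the GGML_ASSERT(value > 0) checks above guard against ever waiting on or signaling a vk_semaphore whose value was never set.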
@@ -1295,14 +1293,14 @@ static void ggml_vk_dispatch_pipeline(vk_submission& s, vk_pipeline& pipeline, s
     s.buffer.dispatch(wg0, wg1, wg2);
 }
 
-static void ggml_vk_end_submission(vk_submission& s, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores) {
+static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
     s.buffer.end();
 
     s.wait_semaphores = std::move(wait_semaphores);
     s.signal_semaphores = std::move(signal_semaphores);
 }
 
-static vk_sequence ggml_vk_buffer_write_2d_async(vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
+static vk_sequence ggml_vk_buffer_write_2d_async(vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
 #ifdef VK_DEBUG
     std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl;
 #endif
@@ -1422,7 +1420,7 @@ static inline size_t ggml_vk_align_size(size_t width, size_t align) {
     return CEIL_DIV(width, align) * align;
 }
 
-static vk_sequence ggml_vk_buffer_write_async(vk_buffer* dst, size_t offset, const void * src, size_t size, vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
+static vk_sequence ggml_vk_buffer_write_async(vk_buffer* dst, size_t offset, const void * src, size_t size, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
 #ifdef VK_DEBUG
     std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl;
 #endif
@@ -1436,7 +1434,7 @@ static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src
     ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1, q);
 }
 
-static vk_sequence ggml_vk_buffer_read_async(vk_buffer* src, size_t offset, void * dst, size_t size, vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores, vk_submission* s = nullptr) {
+static vk_sequence ggml_vk_buffer_read_async(vk_buffer* src, size_t offset, void * dst, size_t size, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores, vk_submission* s = nullptr) {
 #ifdef VK_DEBUG
     std::cerr << "ggml_vk_buffer_read_async(" << size << ")" << std::endl;
 #endif
@@ -1546,7 +1544,6 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
             nullptr,
             1,
             &cmd_buffer);
-        std::lock_guard guard(q.mutex);
         q.queue.submit({ submit_info }, fence);
         vk::resultCheck(vk_device.device.waitForFences({ fence }, true, uint64_t(-1)), "vk_buffer_read staging waitForFences");
         vk_device.device.destroyFence(fence);
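With the per-queue mutex gone, queue submission is evidently expected to happen from one thread again, and the synchronous ggml_vk_buffer_read path keeps its fence. Timeline semaphores would also let the host wait on a value directly; a hedged sketch under the same Vulkan 1.2 assumption as above, not code from this patch:

// Block the host until `sem` reaches `value`, without creating a fence.
static void wait_for_value(vk::Device device, vk::Semaphore sem, uint64_t value) {
    vk::SemaphoreWaitInfo wait_info{ {}, 1, &sem, &value };
    vk::resultCheck(device.waitSemaphores(wait_info, uint64_t(-1)),
                    "wait_for_value waitSemaphores");
}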
@@ -1554,7 +1551,7 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
     }
 }
 
-static vk_sequence ggml_vk_h2d_tensor_2d(vk_buffer* dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
+static vk_sequence ggml_vk_h2d_tensor_2d(vk_buffer* dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
 #ifdef VK_DEBUG
     std::cerr << "ggml_vk_h2d_tensor_2d()" << std::endl;
 #endif
@@ -1679,7 +1676,7 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(bool bit16_x, bool bit16_y, in
     return aligned ? &vk_pipeline_matmul_f32_aligned_l : &vk_pipeline_matmul_f32_l;
 }
 
-static vk_sequence ggml_vk_matmul(vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, int m, int n, int k, int stride_a, int stride_b, int stride_d, int split_k, vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores) {
+static vk_sequence ggml_vk_matmul(vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, int m, int n, int k, int stride_a, int stride_b, int stride_d, int split_k, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
 #ifdef VK_DEBUG
     std::cerr << "ggml_vk_matmul(" << m << ", " << n << ", " << k << ")" << std::endl;
 #endif
@@ -1767,7 +1764,7 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_matmul_split_k_reduce, ne12 * ne13);
     }
 
-    std::vector<vk::Semaphore> x_semaphores;
+    std::vector<vk_semaphore> x_semaphores;
 
     if (load_x) {
        for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -1775,9 +1772,9 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                 const uint32_t x_offset = load_x ? x_sz * (i03 * ne02 + i02) : 0;
                 // copy data to device
                 vk::Semaphore s = ggml_vk_create_semaphore();
-                x_semaphores.push_back(s);
+                x_semaphores.push_back({ s, 1 });
                 // Wait for previous matmul to be done before writing to the input buffers again
-                extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, x_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { s }, nullptr, &extra->memcpys));
+                extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, x_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { { s, 1 } }, nullptr, &extra->memcpys));
             }
         }
     }
@@ -1791,24 +1788,22 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
             const uint32_t y_offset = y_sz * (i13 * ne12 + i12);
             const uint32_t d_offset = d_sz * (i13 * ne12 + i12);
 
-            vk::Semaphore s_x;
+            vk::Semaphore s = ggml_vk_create_semaphore();
+            std::vector<vk_semaphore> semaphores = { { s, 1 } };
+
             if (load_x) {
-                s_x = x_semaphores[i03 * ne02 + i02];
+                semaphores.push_back(x_semaphores[i03 * ne02 + i02]);
             }
 
-            vk::Semaphore s_y = ggml_vk_create_semaphore();
-            std::vector<vk::Semaphore> semaphores = { s_y };
-
-            extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, y_offset, src1, i13, i12, vk_device.transfer_queues[1], {}, { s_y }, nullptr, &extra->memcpys));
+            extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, y_offset, src1, i13, i12, vk_device.transfer_queues[1], {}, { { s, 1 } }, nullptr, &extra->memcpys));
 
             // compute
-            vk::Semaphore s_mm = ggml_vk_create_semaphore();
-
-            extra->comp_seqs.push_back(ggml_vk_matmul(*pipeline, { *d_X, x_offset, x_sz }, { *d_Y, y_offset, y_sz }, { *d_D, d_offset, d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, vk_device.compute_queue, std::move(semaphores), { s_mm }));
+            extra->comp_seqs.push_back(ggml_vk_matmul(*pipeline, { *d_X, x_offset, x_sz }, { *d_Y, y_offset, y_sz }, { *d_D, d_offset, d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, vk_device.compute_queue, std::move(semaphores), { { s, 2 } }));
 
             if (dst->backend == GGML_BACKEND_CPU) {
                 // copy dst to host
                 float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, vk_device.transfer_queues[1], { s_mm }, {}));
+                extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, vk_device.transfer_queues[1], { { s, 2 } }, {}));
             }
         }
     }
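The f32 path above shows the new synchronization pattern: one timeline semaphore per output tile, with the value encoding progress through the pipeline. The host-to-device copy signals value 1, the matmul waits for 1 and signals 2, and the read-back waits for 2, so three submissions across three queues are ordered by a single handle. Spelled out with the patch's vk_semaphore struct (illustrative, self-contained sketch, not patch code):

#include <vulkan/vulkan.hpp>
#include <cstdint>
#include <vector>

struct vk_semaphore { vk::Semaphore s; uint64_t value; };  // mirrors the patch

// One tile's ordering: upload -> matmul -> read-back on a single semaphore.
static void plan_tile(vk::Semaphore s,
                      std::vector<vk_semaphore>& upload_signal,
                      std::vector<vk_semaphore>& mm_wait,
                      std::vector<vk_semaphore>& mm_signal,
                      std::vector<vk_semaphore>& read_wait) {
    upload_signal = { { s, 1 } };  // h2d copy bumps the semaphore to 1
    mm_wait       = { { s, 1 } };  // matmul starts once the copy is done...
    mm_signal     = { { s, 2 } };  // ...and bumps it to 2
    read_wait     = { { s, 2 } };  // d2h copy starts once the matmul is done
}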
@@ -1922,7 +1917,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
         ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_matmul_split_k_reduce, ne12 * ne13);
     }
 
-    std::vector<vk::Semaphore> x_semaphores;
+    std::vector<vk_semaphore> x_semaphores;
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -1930,17 +1925,17 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
             const uint32_t qx_offset = qx_sz * it_idx0;
             const uint32_t x_offset = x_sz * it_idx0;
 
-            vk::Semaphore s_qx;
-            vk::Semaphore s_x;
+            // TODO: Find out why one semaphore doesn't work
+            vk::Semaphore sem = ggml_vk_create_semaphore();
+            vk::Semaphore sem2 = ggml_vk_create_semaphore();
 
             if (load_x) {
                 // copy data to device
-                s_qx = ggml_vk_create_semaphore();
-                extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qx, qx_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { s_qx }, nullptr, &extra->memcpys));
+                extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qx, qx_offset, src0, i03, i02, tr0q, {}, { { sem2, 1 } }, nullptr, &extra->memcpys));
             }
 
             if (qx_needs_dequant) {
-                s_x = ggml_vk_create_semaphore();
+                sem = ggml_vk_create_semaphore();
                 vk_submission s = ggml_vk_begin_submission(compq);
                 const std::vector<int> pc = { (int)ne01, (int)ne10, (int)ne10, (int)ne10 };
@@ -1948,16 +1943,18 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
                 ggml_vk_sync_buffers(s.buffer, { { *d_X, x_offset, x_sz } }, compq, vk::AccessFlagBits::eShaderRead, vk::AccessFlagBits::eShaderWrite, false);
                 ggml_vk_dispatch_pipeline(s, *to_fp16_vk_0, { { *d_Qx, qx_offset, qx_sz }, { *d_X, x_offset, x_sz } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)x_ne, 1, 1});
                 if (load_x) {
-                    ggml_vk_end_submission(s, { s_qx }, { s_x });
+                    ggml_vk_end_submission(s, { { sem2, 1 } }, { { sem, 1 } });
                 } else {
-                    ggml_vk_end_submission(s, {}, { s_x });
+                    ggml_vk_end_submission(s, {}, { { sem, 1 } });
                 }
 
                 extra->comp_seqs.push_back({ s });
-                x_semaphores.push_back(s_x);
+                x_semaphores.push_back({ sem, 1 });
+            } else if (load_x) {
+                x_semaphores.push_back({ sem2, 1 });
             } else {
-                x_semaphores.push_back(s_qx);
+                x_semaphores.push_back({ sem, 0 });
             }
         }
     }
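The TODO above points at a real constraint: a timeline value may only be signaled once, and signal operations on one semaphore must strictly increase its value. When load_x and qx_needs_dequant are both true there are two producers (the upload on the transfer queue and the dequant dispatch on the compute queue), so they cannot both signal the same semaphore at value 1; hence the sem/sem2 pair. A hedged reading of the working split, using the names from the hunk above:

// Two producers need two signals; downstream consumers only ever see `sem`.
vk_semaphore upload_signal  = { sem2, 1 };  // transfer queue: raise sem2 to 1
vk_semaphore dequant_wait   = { sem2, 1 };  // compute queue: wait for the upload...
vk_semaphore dequant_signal = { sem,  1 };  // ...then raise sem to 1 for the matmul

Collapsing this onto one semaphore (upload signals 1, dequant signals 2) would also be legal, but then every consumer of x_semaphores would have to know which value applies for a given tile, which may be what the TODO is about.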
@@ -1969,35 +1966,32 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
             const uint32_t it_idx0 = (i03 * ne02 + i02);
             const uint32_t it_idx1 = (i13 * ne12 + i12);
-            const uint32_t qx_offset = load_x ? qx_sz * it_idx0 : 0;
             const uint32_t qy_offset = load_y ? qy_sz * it_idx1 : 0;
             const uint32_t x_offset = x_sz * it_idx0;
             const uint32_t y_offset = y_sz * it_idx1;
             const uint32_t d_offset = d_sz * it_idx1;
 
-            vk::Semaphore s_y;
-            vk::Semaphore s_q;
-
-            const vk::Semaphore s_mm = ggml_vk_create_semaphore();
+            const vk::Semaphore sem = ggml_vk_create_semaphore();
 
-            std::vector<vk::Semaphore> mm_semaphores;
+            std::vector<vk_semaphore> mm_semaphores;
 
             if (load_x) {
                 mm_semaphores.push_back(x_semaphores[it_idx0]);
             }
             if (load_y) {
-                s_y = ggml_vk_create_semaphore();
-                mm_semaphores.push_back(s_y);
-                extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qy, qy_offset, src1, i13, i12, tr1q, {}, { s_y }, nullptr, &extra->memcpys));
+                // Set semaphore to 1
+                extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qy, qy_offset, src1, i13, i12, tr1q, {}, { { sem, 1 }}, nullptr, &extra->memcpys));
+                // Wait for semaphore val 1
+                mm_semaphores.push_back({ sem, 1 });
             }
 
             // compute
-            extra->comp_seqs.push_back(ggml_vk_matmul(*pipeline, { *d_X, x_offset, x_sz }, { *d_Y, y_offset, y_sz }, { *d_D, d_offset, d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, compq, std::move(mm_semaphores), { s_mm }));
+            extra->comp_seqs.push_back(ggml_vk_matmul(*pipeline, { *d_X, x_offset, x_sz }, { *d_Y, y_offset, y_sz }, { *d_D, d_offset, d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, compq, std::move(mm_semaphores), { { sem, 2 } }));
 
             if (dst->backend == GGML_BACKEND_CPU) {
                 // copy dst to host
                 float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, tr1q, { s_mm }, {}));
+                extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, tr1q, { { sem, 2 } }, {}));
             }
         }
     }
@@ -2095,7 +2089,7 @@ static void ggml_vk_mul_mat_vec_q_f16(const ggml_tensor * src0, const ggml_tenso
 
             vk_submission s = ggml_vk_begin_submission(compq);
 
-            vk::Semaphore s_x;
+            vk_semaphore s_x;
             if (load_y) {
                 ggml_vk_h2d_tensor_2d(d_Qy, qy_offset, src1, i13, i12, compq, {}, {}, &s, &extra->memcpys);
             }
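Note the { sem, 0 } entry pushed in the earlier dequant-free, no-load branch: value 0 appears to act as a "nothing to wait for" sentinel. Because ggml_vk_submit asserts value > 0 on every wait and signal, any path that accidentally waits on such an entry fails loudly at the assert instead of deadlocking on a value that will never be signaled. A trivial guard in that spirit (hypothetical helper, not in the patch):

// Only wait on semaphores that carry a real target value.
static bool vk_semaphore_needs_wait(const vk_semaphore& s) {
    return s.value > 0;  // 0 is the "unset" sentinel in this patch
}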
@@ -2335,8 +2329,8 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
     } else {
         d_D = &vk_prealloc_d;
     }
-    vk_buffer* d_X;
-    vk_buffer* d_Y;
+    vk_buffer* d_X = nullptr;
+    vk_buffer* d_Y = nullptr;
     if (transfer_src0) {
         d_X = &vk_prealloc_qx;
     } else {
@@ -2361,20 +2355,18 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
             const uint32_t y_offset = transfer_src1 ? y_sz * it_idx : 0;
             const uint32_t d_offset = d_sz * it_idx;
 
-            vk::Semaphore s_x;
-            vk::Semaphore s_y;
-            vk::Semaphore s_mm = ggml_vk_create_semaphore();
-            std::vector<vk::Semaphore> transfer_semaphores;
+            const vk::Semaphore sem = ggml_vk_create_semaphore();
+            vk::Semaphore sem_x;
+            std::vector<vk_semaphore> transfer_semaphores;
             // copy src0 to device
             if (transfer_src0) {
-                s_x = ggml_vk_create_semaphore();
-                extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, x_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { s_x }, nullptr, &extra->memcpys));
-                transfer_semaphores.push_back(s_x);
+                sem_x = ggml_vk_create_semaphore();
+                extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, x_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { { sem_x, 1 } }, nullptr, &extra->memcpys));
+                transfer_semaphores.push_back({ sem_x, 1 });
             }
             if (transfer_src1) {
-                s_y = ggml_vk_create_semaphore();
-                extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, y_offset, src1, i03, i02, vk_device.transfer_queues[1], {}, { s_y }, nullptr, &extra->memcpys));
-                transfer_semaphores.push_back(s_y);
+                extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, y_offset, src1, i03, i02, vk_device.transfer_queues[1], {}, { { sem, 1 } }, nullptr, &extra->memcpys));
+                transfer_semaphores.push_back({ sem, 1 });
             }
 
             const int64_t i13 = use_src1 ? i03%ne13 : i03;
@@ -2390,13 +2382,13 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
                 ggml_vk_sync_buffers(s.buffer, { ggml_vk_subbuffer(*d_X) }, vk_device.compute_queue, vk::AccessFlagBits::eTransferWrite, vk::AccessFlagBits::eShaderRead, false);
                 ggml_vk_dispatch_pipeline(s, *pipeline, { { *d_X, x_offset, x_sz }, { *d_D, d_offset, d_sz } }, sizeof(vk_op_push_constants), &pc, { (uint32_t)ne00, (uint32_t)ne01, 1});
             }
-            ggml_vk_end_submission(s, std::move(transfer_semaphores), { s_mm });
+            ggml_vk_end_submission(s, std::move(transfer_semaphores), { { sem, 2 } });
             extra->comp_seqs.push_back({ s });
 
             if (dst->backend == GGML_BACKEND_CPU) {
                 // copy dst to host
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-                extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * ne00 * ne01, vk_device.transfer_queues[1], { s_mm }, {}));
+                extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * ne00 * ne01, vk_device.transfer_queues[1], { { sem, 2 } }, {}));
             }
         }
     }
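Finally, the d_X/d_Y nullptr initialization in ggml_vk_op_f32 is a small hardening: the branches below are supposed to assign both pointers, but with a plain `vk_buffer* d_X;` any missed path would read an indeterminate pointer (undefined behavior), and compilers cannot always prove the assignment happens. Initializing to nullptr turns that mistake into a reproducible crash. A minimal, self-contained illustration of the idiom (assumed names, not patch code):

#include <cassert>

struct vk_buffer { /* ... */ };

static vk_buffer* pick_buffer(bool transfer, vk_buffer* staging, vk_buffer* resident) {
    vk_buffer* buf = nullptr;   // deliberate: no indeterminate reads possible
    if (transfer) {
        buf = staging;
    } else {
        buf = resident;
    }
    assert(buf != nullptr);     // a missed branch now fails predictably
    return buf;
}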