From 10263d1579cabed218c1a3a1e381aec6ab688df2 Mon Sep 17 00:00:00 2001
From: Pedro Alves
Date: Mon, 17 Jun 2024 15:37:18 -0300
Subject: [PATCH] chore(gpu): rewrite the scatter/gather logic and move to
 scratch functions

---
 .../tfhe-cuda-backend/cuda/include/integer.h  |  30 +++
 .../cuda/src/integer/integer.cuh              | 105 +++--------
 .../cuda/src/integer/multiplication.cuh       |  63 +++----
 .../cuda/src/utils/helper_multi_gpu.cuh       | 173 ++++++++++--------
 4 files changed, 181 insertions(+), 190 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
index 01c42a6ef4..1f8c75ff03 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -473,6 +473,13 @@ template <typename Torus> struct int_radix_lut {
   Torus *tmp_lwe_before_ks;
   Torus *tmp_lwe_after_ks;
 
+  /// For multi GPU execution we create vectors of pointers for inputs and
+  /// outputs
+  std::vector<Torus *> lwe_array_in_vec;
+  std::vector<Torus *> lwe_after_ks_vec;
+  std::vector<Torus *> lwe_indexes_in_vec;
+  std::vector<Torus *> lwe_trivial_indexes_vec;
+
   int_radix_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
                 uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
                 uint32_t num_radix_blocks, bool allocate_gpu_memory) {
@@ -548,6 +555,16 @@
     cuda_stream_add_callback(streams[0], gpu_indexes[0],
                              host_free_on_stream_callback, h_lwe_indexes);
 
+    /// With multiple GPUs we allocate the per-GPU arrays that are pushed to
+    /// the vectors; when we later gather the data back to GPU 0 we can copy
+    /// it back to the original indexing
+    multi_gpu_lwe_init(streams, gpu_indexes, gpu_count, lwe_array_in_vec,
+                       lwe_indexes_in_vec, num_radix_blocks,
+                       params.big_lwe_dimension + 1);
+    multi_gpu_lwe_init(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
+                       lwe_trivial_indexes_vec, num_radix_blocks,
+                       params.small_lwe_dimension + 1);
+
     // Keyswitch
     Torus big_size =
         (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
@@ -580,6 +597,14 @@
     tmp_lwe_before_ks = base_lut_object->tmp_lwe_before_ks;
     tmp_lwe_after_ks = base_lut_object->tmp_lwe_after_ks;
 
+    /// With multiple GPUs we reuse the per-GPU arrays allocated by the base
+    /// LUT object; when we gather the data back to GPU 0 we can copy it back
+    /// to the original indexing
+    lwe_array_in_vec = base_lut_object->lwe_array_in_vec;
+    lwe_after_ks_vec = base_lut_object->lwe_after_ks_vec;
+    lwe_indexes_in_vec = base_lut_object->lwe_indexes_in_vec;
+    lwe_trivial_indexes_vec = base_lut_object->lwe_trivial_indexes_vec;
+
     mem_reuse = true;
 
     // Allocate LUT
@@ -701,6 +726,11 @@
       cuda_synchronize_stream(streams[i], gpu_indexes[i]);
     }
     buffer.clear();
+
+    multi_gpu_lwe_release(streams, gpu_indexes, lwe_array_in_vec);
+    multi_gpu_lwe_release(streams, gpu_indexes, lwe_after_ks_vec);
+    multi_gpu_lwe_release(streams, gpu_indexes, lwe_indexes_in_vec);
+    multi_gpu_lwe_release(streams, gpu_indexes, lwe_trivial_indexes_vec);
   }
  }
 };
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 0ec2a1a9b0..79d669bf42 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -160,52 +160,28 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
 
   /// For multi GPU execution we create vectors of pointers for inputs and
   /// outputs
-  std::vector<Torus *> lwe_array_in_vec;
-  std::vector<Torus *> lwe_after_ks_vec;
-  std::vector<Torus *> lwe_indexes_in_vec;
-  std::vector<Torus *> lwe_trivial_indexes_vec;
-
-  /// With multiple GPUs we allocate arrays to be pushed to the vectors and copy
-  /// data on each GPU then when we gather data to GPU 0 we can copy back to the
-  /// original indexing
-  if (gpu_count > 1) {
-    multi_gpu_scatter(streams, gpu_indexes, gpu_count, lwe_array_in_vec,
-                      lwe_array_in, lwe_indexes_in_vec,
-                      lut->lwe_indexes_in, num_radix_blocks,
-                      big_lwe_dimension + 1);
-    multi_gpu_scatter(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
-                      lut->tmp_lwe_after_ks, lwe_trivial_indexes_vec,
-                      lut->lwe_trivial_indexes, num_radix_blocks,
-                      small_lwe_dimension + 1);
-  } else {
-    /// GPU 0 retains the original array
-    lwe_array_in_vec.push_back(lwe_array_in);
-    lwe_after_ks_vec.push_back(lut->tmp_lwe_after_ks);
-    lwe_indexes_in_vec.push_back(lut->lwe_indexes_in);
-    lwe_trivial_indexes_vec.push_back(lut->lwe_trivial_indexes);
-  }
+  std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
+  std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
+  std::vector<Torus *> lwe_indexes_in_vec = lut->lwe_indexes_in_vec;
+  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
+
+  /// With multiple GPUs the data is pushed to the per-GPU vectors; when we
+  /// gather it back to GPU 0 we can copy it back to the original indexing
+  multi_gpu_lwe_scatter(
+      streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_in,
+      lut->lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1);
 
   /// Apply KS to go from a big LWE dimension to a small LWE dimension
   execute_keyswitch(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
                     lwe_trivial_indexes_vec, lwe_array_in_vec,
-                    lwe_indexes_in_vec, ksks, big_lwe_dimension,
+                    lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
                     small_lwe_dimension, ks_base_log, ks_level,
                     num_radix_blocks, false);
-  /// Copy data back to GPU 0 and release vecs
-  if (gpu_count > 1) {
-    multi_gpu_gather(streams, gpu_indexes, gpu_count,
-                     lut->tmp_lwe_after_ks, lwe_after_ks_vec,
-                     lut->lwe_trivial_indexes, num_radix_blocks,
-                     small_lwe_dimension + 1);
-    multi_gpu_release(streams, gpu_indexes, lwe_array_in_vec);
-    multi_gpu_release(streams, gpu_indexes, lwe_after_ks_vec);
-    multi_gpu_release(streams, gpu_indexes, lwe_indexes_in_vec);
-    multi_gpu_release(streams, gpu_indexes, lwe_trivial_indexes_vec);
-  }
-  lwe_array_in_vec.clear();
-  lwe_after_ks_vec.clear();
-  lwe_indexes_in_vec.clear();
-  lwe_trivial_indexes_vec.clear();
+
+  /// Copy data back to GPU 0
+  multi_gpu_lwe_gather(
+      streams, gpu_indexes, gpu_count, lut->tmp_lwe_after_ks, lwe_after_ks_vec,
+      lut->lwe_trivial_indexes, num_radix_blocks, small_lwe_dimension + 1);
 
   /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
   /// dimension to a big LWE dimension
@@ -233,7 +209,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
 
   auto params = lut->params;
   auto pbs_type = params.pbs_type;
-  uint32_t big_lwe_dimension = params.big_lwe_dimension;
+  auto big_lwe_dimension = params.big_lwe_dimension;
   auto small_lwe_dimension = params.small_lwe_dimension;
   auto ks_level = params.ks_level;
   auto ks_base_log = params.ks_base_log;
@@ -255,49 +231,26 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
 
   /// For multi GPU execution we create vectors of pointers for inputs and
   /// outputs
-  std::vector<Torus *> lwe_array_in_vec;
-  std::vector<Torus *> lwe_after_ks_vec;
-  std::vector<Torus *> lwe_indexes_in_vec;
-  std::vector<Torus *> lwe_trivial_indexes_vec;
-  if (gpu_count > 1) {
-    multi_gpu_scatter(streams, gpu_indexes, gpu_count, lwe_array_in_vec,
-                      lwe_array_pbs_in, lwe_indexes_in_vec,
-                      lut->lwe_indexes_in, num_radix_blocks,
-                      big_lwe_dimension + 1);
-    multi_gpu_scatter(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
-                      lut->tmp_lwe_after_ks, lwe_trivial_indexes_vec,
-                      lut->lwe_trivial_indexes, num_radix_blocks,
-                      small_lwe_dimension + 1);
-  } else {
-    /// GPU 0 retains the original array
-    lwe_array_in_vec.push_back(lwe_array_pbs_in);
-    lwe_after_ks_vec.push_back(lut->tmp_lwe_after_ks);
-    lwe_indexes_in_vec.push_back(lut->lwe_indexes_in);
-    lwe_trivial_indexes_vec.push_back(lut->lwe_trivial_indexes);
-  }
+  std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
+  std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
+  std::vector<Torus *> lwe_indexes_in_vec = lut->lwe_indexes_in_vec;
+  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
+
+  multi_gpu_lwe_scatter(
+      streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
+      lut->lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1);
 
   /// Apply KS to go from a big LWE dimension to a small LWE dimension
   execute_keyswitch(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
                     lwe_trivial_indexes_vec, lwe_array_in_vec,
-                    lwe_indexes_in_vec, ksks, big_lwe_dimension,
+                    lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
                     small_lwe_dimension, ks_base_log, ks_level,
                     num_radix_blocks, false);
 
   /// Copy data back to GPU 0 and release vecs
-  if (gpu_count > 1) {
-    multi_gpu_gather(streams, gpu_indexes, gpu_count,
-                     lut->tmp_lwe_after_ks, lwe_after_ks_vec,
-                     lut->lwe_trivial_indexes, num_radix_blocks,
-                     small_lwe_dimension + 1);
-    multi_gpu_release(streams, gpu_indexes, lwe_array_in_vec);
-    multi_gpu_release(streams, gpu_indexes, lwe_after_ks_vec);
-    multi_gpu_release(streams, gpu_indexes, lwe_indexes_in_vec);
-    multi_gpu_release(streams, gpu_indexes, lwe_trivial_indexes_vec);
-  }
-  lwe_array_in_vec.clear();
-  lwe_after_ks_vec.clear();
-  lwe_indexes_in_vec.clear();
-  lwe_trivial_indexes_vec.clear();
+  multi_gpu_lwe_gather(
+      streams, gpu_indexes, gpu_count, lut->tmp_lwe_after_ks, lwe_after_ks_vec,
+      lut->lwe_trivial_indexes, num_radix_blocks, small_lwe_dimension + 1);
 
   /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
   /// dimension to a big LWE dimension
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
index fadb2420c9..5bb11a5f2a 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -225,11 +225,12 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
   auto message_modulus = mem_ptr->params.message_modulus;
   auto carry_modulus = mem_ptr->params.carry_modulus;
   auto num_blocks = num_blocks_in_radix;
-  auto big_lwe_size = mem_ptr->params.big_lwe_dimension + 1;
+  auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
+  auto big_lwe_size = big_lwe_dimension + 1;
   auto glwe_dimension = mem_ptr->params.glwe_dimension;
   auto polynomial_size = mem_ptr->params.polynomial_size;
   auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
-  auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
+  auto small_lwe_size = lwe_dimension + 1;
 
   if (old_blocks != terms) {
     cuda_memcpy_async_gpu_to_gpu(old_blocks, terms,
@@ -334,52 +335,32 @@
 
   /// For multi GPU execution we create vectors of pointers for inputs and
   /// outputs
-  std::vector<Torus *> lwe_array_in_vec;
-  std::vector<Torus *> lwe_after_ks_vec;
-  std::vector<Torus *> lwe_indexes_in_vec;
-  std::vector<Torus *> lwe_trivial_indexes_vec;
-  if (gpu_count > 1) {
-    multi_gpu_scatter(streams, gpu_indexes, gpu_count,
-                      lwe_array_in_vec, new_blocks, lwe_indexes_in_vec,
-                      lwe_indexes_in, message_count,
-                      big_lwe_dimension + 1);
-    multi_gpu_scatter(streams, gpu_indexes, gpu_count,
-                      lwe_after_ks_vec, small_lwe_vector,
-                      lwe_trivial_indexes_vec, lwe_indexes_in,
-                      message_count, lwe_dimension + 1);
-  } else {
-    /// GPU 0 retains the original array
-    lwe_array_in_vec.push_back(new_blocks);
-    lwe_after_ks_vec.push_back(small_lwe_vector);
-    lwe_indexes_in_vec.push_back(lwe_indexes_in);
-    lwe_trivial_indexes_vec.push_back(lwe_indexes_in);
-  }
+  std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
+  std::vector<Torus *> small_lwe_vector_vec =
+      luts_message_carry->lwe_after_ks_vec;
+  std::vector<Torus *> lwe_indexes_in_vec =
+      luts_message_carry->lwe_indexes_in_vec;
+  std::vector<Torus *> lwe_trivial_indexes_vec =
+      luts_message_carry->lwe_trivial_indexes_vec;
+
+  multi_gpu_lwe_scatter(streams, gpu_indexes, gpu_count,
+                        new_blocks_vec, new_blocks, lwe_indexes_in,
+                        message_count, big_lwe_size);
 
   /// Apply KS to go from a big LWE dimension to a small LWE dimension
   /// After this keyswitch execution, we need to synchronize the streams
   /// because the keyswitch and PBS do not operate on the same number of
   /// inputs
-  execute_keyswitch(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
-                    lwe_trivial_indexes_vec, lwe_array_in_vec,
-                    lwe_indexes_in_vec, ksks,
-                    polynomial_size * glwe_dimension, lwe_dimension,
-                    mem_ptr->params.ks_base_log,
-                    mem_ptr->params.ks_level, message_count, true);
+  execute_keyswitch(
+      streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
+      lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec, ksks,
+      big_lwe_dimension, lwe_dimension, mem_ptr->params.ks_base_log,
+      mem_ptr->params.ks_level, message_count, true);
 
   /// Copy data back to GPU 0 and release vecs
-  if (gpu_count > 1) {
-    multi_gpu_gather(streams, gpu_indexes, gpu_count, small_lwe_vector,
-                     lwe_after_ks_vec, lwe_indexes_in, message_count,
-                     lwe_dimension + 1);
-    multi_gpu_release(streams, gpu_indexes, lwe_array_in_vec);
-    multi_gpu_release(streams, gpu_indexes, lwe_after_ks_vec);
-    multi_gpu_release(streams, gpu_indexes, lwe_indexes_in_vec);
-    multi_gpu_release(streams, gpu_indexes, lwe_trivial_indexes_vec);
-  }
-  lwe_array_in_vec.clear();
-  lwe_after_ks_vec.clear();
-  lwe_indexes_in_vec.clear();
-  lwe_trivial_indexes_vec.clear();
+  multi_gpu_lwe_gather(streams, gpu_indexes, gpu_count,
+                       small_lwe_vector, small_lwe_vector_vec,
+                       lwe_indexes_in, message_count, small_lwe_size);
 
   /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
   /// dimension to a big LWE dimension
diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
index 47036e307f..75295e5d0e 100644
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
@@ -3,117 +3,144 @@
 
 #include "helper_multi_gpu.h"
 
-/// Load an array residing on one GPU to all active gpus
-/// and split the array among them.
-/// The indexing logic is given by an index array.
+/// Allocates the input/output vectors for all devices
+/// and initializes the corresponding index arrays to the trivial
+/// indexing
 template <typename Torus>
-void multi_gpu_scatter(cudaStream_t *streams, uint32_t *gpu_indexes,
-                       uint32_t gpu_count, std::vector<Torus *> &dest,
-                       Torus *src, std::vector<Torus *> &dest_indexes,
-                       Torus *src_indexes, uint32_t num_inputs,
-                       uint32_t elements_per_input) {
-
+void multi_gpu_lwe_init(cudaStream_t *streams, uint32_t *gpu_indexes,
+                        uint32_t gpu_count, std::vector<Torus *> &dest,
+                        std::vector<Torus *> &dest_indexes, uint32_t num_inputs,
+                        uint32_t elements_per_input) {
   auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
-  auto cpu_indexes = (Torus *)malloc(num_inputs * sizeof(Torus));
-  cuda_memcpy_async_to_cpu(cpu_indexes, src_indexes, num_inputs * sizeof(Torus),
-                           streams[0], gpu_indexes[0]);
-  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+  auto h_lwe_trivial_indexes = (Torus *)malloc(num_inputs * sizeof(Torus));
+  for (int i = 0; i < num_inputs; i++)
+    h_lwe_trivial_indexes[i] = i;
 
-  // TODO move allocation/drop to scratch/cleanup
+  dest.resize(active_gpu_count);
+  dest_indexes.resize(active_gpu_count);
+#pragma omp parallel for num_threads(active_gpu_count)
   for (uint i = 0; i < active_gpu_count; i++) {
     auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
-    Torus *array = (Torus *)cuda_malloc_async(
+    Torus *d_array = (Torus *)cuda_malloc_async(
         inputs_on_gpu * elements_per_input * sizeof(Torus), streams[i],
         gpu_indexes[i]);
-    Torus *index_array = (Torus *)cuda_malloc_async(
+    Torus *d_index_array = (Torus *)cuda_malloc_async(
         inputs_on_gpu * sizeof(Torus), streams[i], gpu_indexes[i]);
-    cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-    dest.push_back(array);
-    dest_indexes.push_back(index_array);
+
+    cuda_memcpy_async_to_gpu(d_index_array, h_lwe_trivial_indexes,
+                             inputs_on_gpu * sizeof(Torus), streams[i],
+                             gpu_indexes[i]);
+
+    dest[i] = d_array;
+    dest_indexes[i] = d_index_array;
   }
 
-#pragma omp parallel for num_threads(num_inputs)
-  for (uint j = 0; j < num_inputs; j++) {
-    int gpu_index = 0;
-    Torus index_on_gpu = 0;
-    Torus accumulated_inputs = 0;
-    for (uint i = 0; i < active_gpu_count; i++) {
-      int inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
-      if (j < accumulated_inputs + inputs_on_gpu) {
-        gpu_index = i;
-        index_on_gpu = j - accumulated_inputs;
-        printf("input j: %d, gpu_index: %d, index on gpu: %d\n", j, gpu_indexes,
-               index_on_gpu);
-      }
-      accumulated_inputs += inputs_on_gpu;
+  for (uint i = 0; i < active_gpu_count; i++)
+    cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+
+  free(h_lwe_trivial_indexes);
+}
+/// Load an array residing on one GPU to all active GPUs
+/// and split the array among them.
+/// The input indexing logic is given by an index array.
+/// The output indexing is always the trivial one
+template <typename Torus>
+void multi_gpu_lwe_scatter(cudaStream_t *streams, uint32_t *gpu_indexes,
+                           uint32_t gpu_count, std::vector<Torus *> &dest,
+                           Torus *src, Torus *d_src_indexes,
+                           uint32_t num_inputs, uint32_t elements_per_input) {
+
+  auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
+
+  auto h_src_indexes = (Torus *)malloc(num_inputs * sizeof(Torus));
+  cuda_memcpy_async_to_cpu(h_src_indexes, d_src_indexes,
+                           num_inputs * sizeof(Torus), streams[0],
+                           gpu_indexes[0]);
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
+  dest.resize(active_gpu_count);
+
+#pragma omp parallel for num_threads(active_gpu_count)
+  for (uint i = 0; i < active_gpu_count; i++) {
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto gpu_offset = 0;
+    for (uint j = 0; j < i; j++) {
+      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
+    }
+    auto src_indexes = h_src_indexes + gpu_offset;
+
+    // TODO Check if we can increase parallelization by adding another omp
+    // clause here
+    for (uint j = 0; j < inputs_on_gpu; j++) {
+      auto d_dest = dest[i] + j * elements_per_input;
+      auto d_src = src + src_indexes[j] * elements_per_input;
+
+      cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
+                                   elements_per_input * sizeof(Torus),
+                                   streams[i], gpu_indexes[i]);
     }
-    cuda_memcpy_async_gpu_to_gpu(dest[gpu_index] +
-                                     index_on_gpu * elements_per_input,
-                                 src + cpu_indexes[j] * elements_per_input,
-                                 elements_per_input * sizeof(Torus),
-                                 streams[gpu_index], gpu_indexes[gpu_index]);
-    cuda_memset_async(dest_indexes[gpu_index] + index_on_gpu, index_on_gpu,
-                      sizeof(Torus), streams[gpu_index],
-                      gpu_indexes[gpu_index]);
   }
-  for (uint i = 0; i < active_gpu_count; i++) {
+
+  for (uint i = 0; i < active_gpu_count; i++)
     cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-  }
-  free(cpu_indexes);
+
+  free(h_src_indexes);
 }
 
 /// Copy data from multiple GPUs back to GPU 0 following the indexing given in
 /// dest_indexes
+/// The input indexing should be the trivial one
 template <typename Torus>
-void multi_gpu_gather(cudaStream_t *streams, uint32_t *gpu_indexes,
-                      uint32_t gpu_count, Torus *dest,
-                      const std::vector<Torus *> &src, Torus *dest_indexes,
-                      uint32_t num_inputs, uint32_t elements_per_input) {
+void multi_gpu_lwe_gather(cudaStream_t *streams, uint32_t *gpu_indexes,
+                          uint32_t gpu_count, Torus *dest,
+                          const std::vector<Torus *> &src,
+                          Torus *d_dest_indexes, uint32_t num_inputs,
+                          uint32_t elements_per_input) {
 
   auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
 
-  auto dest_cpu_indexes = (Torus *)malloc(num_inputs * sizeof(Torus));
-  cuda_memcpy_async_to_cpu(dest_cpu_indexes, dest_indexes,
+  auto h_dest_indexes = (Torus *)malloc(num_inputs * sizeof(Torus));
+  cuda_memcpy_async_to_cpu(h_dest_indexes, d_dest_indexes,
                            num_inputs * sizeof(Torus), streams[0],
                            gpu_indexes[0]);
   cuda_synchronize_stream(streams[0], gpu_indexes[0]);
 
-#pragma omp parallel for num_threads(num_inputs)
-  for (uint j = 0; j < num_inputs; j++) {
-    int gpu_index = 0;
-    Torus index_on_gpu = 0;
-    Torus accumulated_inputs = 0;
-    for (uint i = 0; i < active_gpu_count; i++) {
-      int inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
-      if (j < accumulated_inputs + inputs_on_gpu) {
-        gpu_index = i;
-        index_on_gpu = j - accumulated_inputs;
-        break;
-      }
-      accumulated_inputs += inputs_on_gpu;
+#pragma omp parallel for num_threads(active_gpu_count)
+  for (uint i = 0; i < active_gpu_count; i++) {
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto gpu_offset = 0;
+    for (uint j = 0; j < i; j++) {
+      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
+    }
+    auto dest_indexes = h_dest_indexes + gpu_offset;
+
+    // TODO Check if we can increase parallelization by adding another omp
+    // clause here
+    for (uint j = 0; j < inputs_on_gpu; j++) {
+      auto d_dest = dest + dest_indexes[j] * elements_per_input;
+      auto d_src = src[i] + j * elements_per_input;
+
+      cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
+                                   elements_per_input * sizeof(Torus),
+                                   streams[i], gpu_indexes[i]);
     }
-    cuda_memcpy_async_gpu_to_gpu(
-        dest + dest_cpu_indexes[j] * elements_per_input,
-        src[gpu_index] + index_on_gpu * elements_per_input,
-        elements_per_input * sizeof(Torus), streams[gpu_index],
-        gpu_indexes[gpu_index]);
   }
-  for (uint i = 0; i < active_gpu_count; i++) {
+
+  for (uint i = 0; i < active_gpu_count; i++)
     cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-  }
-  free(dest_cpu_indexes);
+
+  free(h_dest_indexes);
 }
 
 template <typename Torus>
-void multi_gpu_release(cudaStream_t *streams, uint32_t *gpu_indexes,
-                       std::vector<Torus *> &vec) {
+void multi_gpu_lwe_release(cudaStream_t *streams, uint32_t *gpu_indexes,
+                           std::vector<Torus *> &vec) {
 
 #pragma omp parallel for num_threads(vec.size())
   for (uint i = 0; i < vec.size(); i++) {
     cuda_drop_async(vec[i], streams[i], gpu_indexes[i]);
     cuda_synchronize_stream(streams[i], gpu_indexes[i]);
   }
+  vec.clear();
 }
 
 #endif
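
For reference, a minimal sketch of how the reworked pieces fit together, mirroring the univariate apply path in integer.cuh above. The wrapper name apply_lut_ks_multi_gpu_sketch and its parameter list (including the Torus **ksks type) are assumptions made only to keep the example self-contained; it relies on the per-GPU buffers having been allocated once by multi_gpu_lwe_init in the int_radix_lut scratch and released later through multi_gpu_lwe_release:

  template <typename Torus>
  void apply_lut_ks_multi_gpu_sketch(
      cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
      int_radix_lut<Torus> *lut, Torus *lwe_array_in, Torus **ksks,
      uint32_t big_lwe_dimension, uint32_t small_lwe_dimension,
      uint32_t ks_base_log, uint32_t ks_level, uint32_t num_radix_blocks) {
    // The per-GPU buffers and trivial indexes come from the scratch
    // (multi_gpu_lwe_init), so nothing is allocated here.
    std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
    std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
    std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

    // Split the inputs across the active GPUs following the caller-provided
    // input indexes; the per-GPU copies end up trivially indexed.
    multi_gpu_lwe_scatter(streams, gpu_indexes, gpu_count, lwe_array_in_vec,
                          lwe_array_in, lut->lwe_indexes_in, num_radix_blocks,
                          big_lwe_dimension + 1);

    // Keyswitch on every GPU, using the trivial indexing on both sides.
    execute_keyswitch(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
                      lwe_trivial_indexes_vec, lwe_array_in_vec,
                      lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
                      small_lwe_dimension, ks_base_log, ks_level,
                      num_radix_blocks, false);

    // Bring the keyswitched blocks back to GPU 0 in the original layout; the
    // buffers themselves are freed later by multi_gpu_lwe_release.
    multi_gpu_lwe_gather(streams, gpu_indexes, gpu_count, lut->tmp_lwe_after_ks,
                         lwe_after_ks_vec, lut->lwe_trivial_indexes,
                         num_radix_blocks, small_lwe_dimension + 1);
  }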