diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
index c03120a069..0bd4030465 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -583,15 +583,15 @@ template <typename Torus> struct int_radix_lut {
     /// With multiple GPUs we allocate arrays to be pushed to the vectors and
     /// copy data on each GPU then when we gather data to GPU 0 we can copy
     /// back to the original indexing
-    multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
-                        num_radix_blocks, params.big_lwe_dimension + 1,
-                        false);
-    multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
-                        num_radix_blocks, params.small_lwe_dimension + 1,
-                        false);
-    multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
-                        num_radix_blocks, params.big_lwe_dimension + 1,
-                        false);
+    multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count,
+                        lwe_array_in_vec, num_radix_blocks,
+                        params.big_lwe_dimension + 1, false);
+    multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count,
+                        lwe_after_ks_vec, num_radix_blocks,
+                        params.small_lwe_dimension + 1, false);
+    multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count,
+                        lwe_after_pbs_vec, num_radix_blocks,
+                        params.big_lwe_dimension + 1, false);
     multi_gpu_alloc_array(streams, gpu_indexes, active_gpu_count,
                           lwe_trivial_indexes_vec, num_radix_blocks, false);
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
@@ -778,10 +778,10 @@ template <typename Torus> struct int_radix_lut {
     }
     buffer.clear();

-    multi_gpu_release_lwe(streams, gpu_indexes, lwe_array_in_vec, false);
-    multi_gpu_release_lwe(streams, gpu_indexes, lwe_after_ks_vec, false);
-    multi_gpu_release_lwe(streams, gpu_indexes, lwe_after_pbs_vec, false);
-    multi_gpu_release_lwe(streams, gpu_indexes, lwe_trivial_indexes_vec);
+    multi_gpu_release(streams, gpu_indexes, lwe_array_in_vec, false);
+    multi_gpu_release(streams, gpu_indexes, lwe_after_ks_vec, false);
+    multi_gpu_release(streams, gpu_indexes, lwe_after_pbs_vec, false);
+    multi_gpu_release(streams, gpu_indexes, lwe_trivial_indexes_vec);
     lwe_array_in_vec.clear();
     lwe_after_ks_vec.clear();
     lwe_after_pbs_vec.clear();
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
index 9850fbadda..a8816ff290 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -131,13 +131,10 @@ void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes,
                        const LweArrayVariant<Torus> &lwe_input_indexes,
                        Torus **ksks, uint32_t lwe_dimension_in,
                        uint32_t lwe_dimension_out, uint32_t base_log,
-                       uint32_t level_count, uint32_t num_samples,
-                       bool sync_streams = true) {
+                       uint32_t level_count, uint32_t num_samples) {

   /// If the number of radix blocks is lower than the number of GPUs, not all
   /// GPUs will be active and there will be 1 input per GPU
-  if (sync_streams)
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);

   for (uint i = 0; i < gpu_count; i++) {
     int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
@@ -155,11 +152,6 @@ void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes,
                           current_lwe_input_indexes, ksks[i], lwe_dimension_in,
                           lwe_dimension_out, base_log, level_count,
                           num_samples_on_gpu);
   }
-
-  if (sync_streams)
-    for (uint i = 0; i < gpu_count; i++) {
-      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-    }
 }
 #endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index ec8a0a27d8..70fe720654 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -168,17 +168,17 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
         lwe_trivial_indexes_vec[0], lwe_array_in, lut->lwe_indexes_in, ksks,
         big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level,
-        num_radix_blocks, false);
+        num_radix_blocks);

     /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
     /// dimension to a big LWE dimension
-    execute_pbs(
-        streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
-        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
-        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
-        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, num_radix_blocks, 1, 0,
-        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
+    execute_pbs(streams, gpu_indexes, 1, lwe_array_out,
+                lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
+                lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
+                lut->buffer, glwe_dimension, small_lwe_dimension,
+                polynomial_size, pbs_base_log, pbs_level,
+                grouping_factor, num_radix_blocks, 1, 0,
+                cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
   } else {
     /// Make sure all data that should be on GPU 0 is indeed there
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
@@ -188,14 +188,14 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
     multi_gpu_scatter_lwe(
         streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_in,
         lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
-        big_lwe_dimension + 1, false);
+        big_lwe_dimension + 1);

     /// Apply KS to go from a big LWE dimension to a small LWE dimension
-    execute_keyswitch(streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
-                      lwe_trivial_indexes_vec, lwe_array_in_vec,
-                      lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
-                      small_lwe_dimension, ks_base_log, ks_level,
-                      num_radix_blocks, false);
+    execute_keyswitch(streams, gpu_indexes, active_gpu_count,
+                      lwe_after_ks_vec, lwe_trivial_indexes_vec,
+                      lwe_array_in_vec, lwe_trivial_indexes_vec, ksks,
+                      big_lwe_dimension, small_lwe_dimension,
+                      ks_base_log, ks_level, num_radix_blocks);

     /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
     /// dimension to a big LWE dimension
@@ -205,13 +205,14 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
         lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
         glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
         pbs_level, grouping_factor, num_radix_blocks, 1, 0,
-        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
+        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);

     /// Copy data back to GPU 0 and release vecs
-    multi_gpu_gather_lwe(streams, gpu_indexes, active_gpu_count, lwe_array_out,
-                         lwe_after_pbs_vec, lut->h_lwe_indexes_out,
+    multi_gpu_gather_lwe(streams, gpu_indexes, active_gpu_count,
+                         lwe_array_out, lwe_after_pbs_vec,
+                         lut->h_lwe_indexes_out,
                          lut->using_trivial_lwe_indexes,
-                         num_radix_blocks, big_lwe_dimension + 1, false);
+                         num_radix_blocks, big_lwe_dimension + 1);

     /// Synchronize all GPUs
     for (uint i = 0; i < active_gpu_count; i++) {
@@ -260,30 +261,30 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
         lwe_trivial_indexes_vec[0], lwe_array_pbs_in, lut->lwe_indexes_in,
         ksks, big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level,
-        num_radix_blocks, false);
+        num_radix_blocks);

     /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
     /// dimension to a big LWE dimension
-    execute_pbs(
-        streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
-        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
-        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
-        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, num_radix_blocks, 1, 0,
-        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
+    execute_pbs(streams, gpu_indexes, 1, lwe_array_out,
+                lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
+                lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
+                lut->buffer, glwe_dimension, small_lwe_dimension,
+                polynomial_size, pbs_base_log, pbs_level,
+                grouping_factor, num_radix_blocks, 1, 0,
+                cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
   } else {
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);

     multi_gpu_scatter_lwe(
-        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
-        lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
-        big_lwe_dimension + 1, false);
+        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
+        lwe_array_pbs_in, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
+        num_radix_blocks, big_lwe_dimension + 1);

     /// Apply KS to go from a big LWE dimension to a small LWE dimension
-    execute_keyswitch(streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
-                      lwe_trivial_indexes_vec, lwe_array_in_vec,
-                      lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
-                      small_lwe_dimension, ks_base_log, ks_level,
-                      num_radix_blocks, false);
+    execute_keyswitch(streams, gpu_indexes, active_gpu_count,
+                      lwe_after_ks_vec, lwe_trivial_indexes_vec,
+                      lwe_array_in_vec, lwe_trivial_indexes_vec, ksks,
+                      big_lwe_dimension, small_lwe_dimension,
+                      ks_base_log, ks_level, num_radix_blocks);

     /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
     /// dimension to a big LWE dimension
@@ -293,13 +294,14 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
         lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
         glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
         pbs_level, grouping_factor, num_radix_blocks, 1, 0,
-        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
+        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);

     /// Copy data back to GPU 0 and release vecs
-    multi_gpu_gather_lwe(streams, gpu_indexes, active_gpu_count, lwe_array_out,
-                         lwe_after_pbs_vec, lut->h_lwe_indexes_out,
+    multi_gpu_gather_lwe(streams, gpu_indexes, active_gpu_count,
+                         lwe_array_out, lwe_after_pbs_vec,
+                         lut->h_lwe_indexes_out,
                          lut->using_trivial_lwe_indexes,
-                         num_radix_blocks, big_lwe_dimension + 1, false);
+                         num_radix_blocks, big_lwe_dimension + 1);

     /// Synchronize all GPUs
     for (uint i = 0; i < active_gpu_count; i++) {
@@ -674,8 +676,7 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
         streams, gpu_indexes, 1, mem_ptr->tmp_small_lwe_vector,
         mem_ptr->lut->lwe_trivial_indexes, cur_input_block,
         mem_ptr->lut->lwe_trivial_indexes, ksks, params.big_lwe_dimension,
-        params.small_lwe_dimension, params.ks_base_log, params.ks_level, 1,
-        false);
+        params.small_lwe_dimension, params.ks_base_log, params.ks_level, 1);

     cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size],
                                  mem_ptr->tmp_small_lwe_vector,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
index 7e26a76781..3608b95fae 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -359,7 +359,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
           lwe_indexes_in, new_blocks, lwe_indexes_in, ksks,
           polynomial_size * glwe_dimension, small_lwe_dimension,
           mem_ptr->params.ks_base_log,
-          mem_ptr->params.ks_level, message_count, false);
+          mem_ptr->params.ks_level, message_count);

       /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
       /// dimension to a big LWE dimension
@@ -370,7 +370,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
           glwe_dimension, small_lwe_dimension, polynomial_size,
           mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
           mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
-          mem_ptr->params.pbs_type, false);
+          mem_ptr->params.pbs_type);
     } else {
       cuda_synchronize_stream(streams[0], gpu_indexes[0]);

@@ -378,18 +378,17 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
           streams, gpu_indexes, active_gpu_count, new_blocks_vec, new_blocks,
           luts_message_carry->h_lwe_indexes_in,
           luts_message_carry->using_trivial_lwe_indexes, total_count,
-          big_lwe_size, false);
+          big_lwe_size);

       /// Apply KS to go from a big LWE dimension to a small LWE dimension
       /// After this keyswitch execution, we need to synchronize the streams
       /// because the keyswitch and PBS do not operate on the same number of
       /// inputs
-      execute_keyswitch(streams, gpu_indexes, active_gpu_count,
-                        small_lwe_vector_vec, lwe_trivial_indexes_vec,
-                        new_blocks_vec, lwe_trivial_indexes_vec, ksks,
-                        big_lwe_dimension, small_lwe_dimension,
-                        mem_ptr->params.ks_base_log,
-                        mem_ptr->params.ks_level, total_count, false);
+      execute_keyswitch(
+          streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
+          lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
+          ksks, big_lwe_dimension, small_lwe_dimension,
+          mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);

       /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
       /// dimension to a big LWE dimension
@@ -401,13 +400,17 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
           glwe_dimension, small_lwe_dimension, polynomial_size,
           mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
           mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
-          mem_ptr->params.pbs_type, false);
+          mem_ptr->params.pbs_type);

-      multi_gpu_gather_lwe(streams, gpu_indexes, active_gpu_count, new_blocks,
-                           lwe_after_pbs_vec,
+      multi_gpu_gather_lwe(streams, gpu_indexes, active_gpu_count,
+                           new_blocks, lwe_after_pbs_vec,
                            luts_message_carry->h_lwe_indexes_out,
                            luts_message_carry->using_trivial_lwe_indexes,
                            total_count, big_lwe_size);
+      /// Synchronize all GPUs
+      for (uint i = 1; i < active_gpu_count; i++) {
+        cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+      }
     }

     int rem_blocks = (r > chunk_size) ? r % chunk_size * num_blocks : 0;
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
index 0543e22956..1689019aaa 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -128,9 +128,7 @@ void execute_pbs(
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t level_count, uint32_t grouping_factor,
     uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory, PBS_TYPE pbs_type, bool sync_streams = true) {
-  if (sync_streams)
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    uint32_t max_shared_memory, PBS_TYPE pbs_type) {
   switch (sizeof(Torus)) {
   case sizeof(uint32_t):
     // 32 bits
@@ -242,11 +240,6 @@ void execute_pbs(
     PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer "
           "moduli are supported.")
   }
-
-  if (sync_streams)
-    for (uint i = 0; i < gpu_count; i++) {
-      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-    }
 }

 template <typename Torus>
diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
index ee6a92322f..01d4856d59 100644
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
@@ -24,21 +24,21 @@ void multi_gpu_alloc_array(cudaStream_t *streams, uint32_t *gpu_indexes,
 /// Copy an array residing on one GPU to all active gpus
 template <typename Torus>
 void multi_gpu_copy_array(cudaStream_t *streams, uint32_t *gpu_indexes,
-                          uint32_t active_gpu_count, std::vector<Torus *> &dest,
+                          uint32_t gpu_count, std::vector<Torus *> &dest,
                           Torus *src, uint32_t elements_per_gpu,
                           bool sync_threads = true) {
   if (sync_threads)
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);

-  dest.resize(active_gpu_count);
-  for (uint i = 0; i < active_gpu_count; i++) {
+  dest.resize(gpu_count);
+  for (uint i = 0; i < gpu_count; i++) {
     cuda_memcpy_async_gpu_to_gpu(dest[i], src, elements_per_gpu * sizeof(Torus),
                                  streams[i], gpu_indexes[i]);
   }

   if (sync_threads)
-    for (uint i = 0; i < active_gpu_count; i++)
+    for (uint i = 0; i < gpu_count; i++)
       cuda_synchronize_stream(streams[i], gpu_indexes[i]);
 }
 /// Allocates the input/output vector for all devices
@@ -47,21 +47,18 @@ void multi_gpu_copy_array(cudaStream_t *streams, uint32_t *gpu_indexes,
 template <typename Torus>
 void multi_gpu_alloc_lwe(cudaStream_t *streams, uint32_t *gpu_indexes,
                          uint32_t gpu_count, std::vector<Torus *> &dest,
-                         uint32_t num_inputs, uint32_t elements_per_input,
+                         uint32_t num_inputs, uint32_t lwe_size,
                          bool sync_threads = true) {
-  auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
-
-  dest.resize(active_gpu_count);
-  for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
+  dest.resize(gpu_count);
+  for (uint i = 0; i < gpu_count; i++) {
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
     Torus *d_array = (Torus *)cuda_malloc_async(
-        inputs_on_gpu * elements_per_input * sizeof(Torus), streams[i],
-        gpu_indexes[i]);
+        inputs_on_gpu * lwe_size * sizeof(Torus), streams[i], gpu_indexes[i]);
     dest[i] = d_array;
   }

   if (sync_threads)
-    for (uint i = 0; i < active_gpu_count; i++)
+    for (uint i = 0; i < gpu_count; i++)
       cuda_synchronize_stream(streams[i], gpu_indexes[i]);
 }
 /// Load an array residing on one GPU to all active gpus
@@ -73,48 +70,36 @@ void multi_gpu_scatter_lwe(cudaStream_t *streams, uint32_t *gpu_indexes,
                            uint32_t gpu_count, std::vector<Torus *> &dest,
                            Torus *src, Torus *h_src_indexes,
                            bool is_trivial_index, uint32_t num_inputs,
-                           uint32_t elements_per_input,
-                           bool sync_threads = true) {
+                           uint32_t lwe_size) {

-  auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
-
-  if (sync_threads)
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
-  dest.resize(active_gpu_count);
-  for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+  dest.resize(gpu_count);
+  for (uint i = 0; i < gpu_count; i++) {
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
     auto gpu_offset = 0;
     for (uint j = 0; j < i; j++) {
-      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
+      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
     }

     if (is_trivial_index) {
       auto d_dest = dest[i];
-      auto d_src = src + gpu_offset * elements_per_input;
-      cuda_memcpy_async_gpu_to_gpu(
-          d_dest, d_src, inputs_on_gpu * elements_per_input * sizeof(Torus),
-          streams[i], gpu_indexes[i]);
+      auto d_src = src + gpu_offset * lwe_size;
+      cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
+                                   inputs_on_gpu * lwe_size * sizeof(Torus),
+                                   streams[i], gpu_indexes[i]);
     } else {
       auto src_indexes = h_src_indexes + gpu_offset;
-      // TODO Check if we can increase parallelization by adding another omp
-      // clause here
       for (uint j = 0; j < inputs_on_gpu; j++) {
-        auto d_dest = dest[i] + j * elements_per_input;
-        auto d_src = src + src_indexes[j] * elements_per_input;
+        auto d_dest = dest[i] + j * lwe_size;
+        auto d_src = src + src_indexes[j] * lwe_size;

-        cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
-                                     elements_per_input * sizeof(Torus),
+        cuda_memcpy_async_gpu_to_gpu(d_dest, d_src, lwe_size * sizeof(Torus),
                                      streams[i], gpu_indexes[i]);
       }
     }
   }
-
-  if (sync_threads)
-    for (uint i = 0; i < active_gpu_count; i++)
-      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
 }

 /// Copy data from multiple GPUs back to GPU 0 following the indexing given in
@@ -125,56 +110,43 @@ void multi_gpu_gather_lwe(cudaStream_t *streams, uint32_t *gpu_indexes,
                           uint32_t gpu_count, Torus *dest,
                           const std::vector<Torus *> &src,
                           Torus *h_dest_indexes, bool is_trivial_index,
-                          uint32_t num_inputs, uint32_t elements_per_input,
-                          bool sync_threads = true) {
-
-  auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
+                          uint32_t num_inputs, uint32_t lwe_size) {

-  if (sync_threads)
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
-  for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
+  for (uint i = 0; i < gpu_count; i++) {
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
     auto gpu_offset = 0;
     for (uint j = 0; j < i; j++) {
-      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
+      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
     }

     if (is_trivial_index) {
-      auto d_dest = dest + gpu_offset * elements_per_input;
+      auto d_dest = dest + gpu_offset * lwe_size;
       auto d_src = src[i];
-      cuda_memcpy_async_gpu_to_gpu(
-          d_dest, d_src, inputs_on_gpu * elements_per_input * sizeof(Torus),
-          streams[i], gpu_indexes[i]);
+      cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
+                                   inputs_on_gpu * lwe_size * sizeof(Torus),
+                                   streams[i], gpu_indexes[i]);
     } else {
       auto dest_indexes = h_dest_indexes + gpu_offset;
-      // TODO Check if we can increase parallelization by adding another omp
-      // clause here
       for (uint j = 0; j < inputs_on_gpu; j++) {
-        auto d_dest = dest + dest_indexes[j] * elements_per_input;
-        auto d_src = src[i] + j * elements_per_input;
+        auto d_dest = dest + dest_indexes[j] * lwe_size;
+        auto d_src = src[i] + j * lwe_size;

-        cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
-                                     elements_per_input * sizeof(Torus),
+        cuda_memcpy_async_gpu_to_gpu(d_dest, d_src, lwe_size * sizeof(Torus),
                                      streams[i], gpu_indexes[i]);
       }
     }
   }
-
-  if (sync_threads)
-    for (uint i = 0; i < active_gpu_count; i++)
-      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
 }
+
 template <typename Torus>
-void multi_gpu_release_lwe(cudaStream_t *streams, uint32_t *gpu_indexes,
-                           std::vector<Torus *> &vec,
-                           bool sync_threads = true) {
+void multi_gpu_release(cudaStream_t *streams, uint32_t *gpu_indexes,
+                       std::vector<Torus *> &vec, bool sync_threads = true) {

-  for (uint i = 0; i < vec.size(); i++) {
+  for (uint i = 0; i < vec.size(); i++)
     cuda_drop_async(vec[i], streams[i], gpu_indexes[i]);
-  }
+
   if (sync_threads)
     for (uint i = 0; i < vec.size(); i++)
       cuda_synchronize_stream(streams[i], gpu_indexes[i]);
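
Reviewer note (not part of the diff): the net effect of these changes is that execute_keyswitch and execute_pbs lose their sync_streams flag and never synchronize streams internally, multi_gpu_scatter_lwe only synchronizes GPU 0 before it starts copying, multi_gpu_gather_lwe performs no synchronization at all, and the caller blocks on every active GPU itself once the gather has been enqueued (see the loop added in host_integer_sum_ciphertexts_vec_kb). The sketch below illustrates that caller-managed synchronization pattern against the plain CUDA runtime rather than the backend helpers; every name in it is hypothetical, it omits error checking, and it assumes one stream per GPU (each created on its own device) plus CUDA 11.2+ for the stream-ordered allocator.

```cpp
#include <cuda_runtime.h>
#include <cstddef>
#include <cstdint>
#include <vector>

// Stand-in for the per-GPU work (keyswitch + PBS on that GPU's slice).
__global__ void work_kernel(uint64_t *data, size_t n) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] += 1;
}

// GPU 0 owns d_src/d_dst (gpus.size() * n_per_gpu elements each);
// streams[i] was created on device gpus[i].
void fan_out_compute_gather(std::vector<cudaStream_t> &streams,
                            std::vector<int> &gpus, uint64_t *d_src,
                            uint64_t *d_dst, size_t n_per_gpu) {
  size_t bytes = n_per_gpu * sizeof(uint64_t);
  std::vector<uint64_t *> slices(gpus.size());

  // Wait once for whatever produced d_src on GPU 0's stream before other
  // GPUs start pulling from it (the only sync the scatter step keeps).
  cudaStreamSynchronize(streams[0]);

  for (size_t i = 0; i < gpus.size(); i++) {
    cudaSetDevice(gpus[i]);
    cudaMallocAsync((void **)&slices[i], bytes, streams[i]);
    // Scatter: copy GPU 0's i-th slice to GPU i, asynchronously.
    cudaMemcpyPeerAsync(slices[i], gpus[i], d_src + i * n_per_gpu, gpus[0],
                        bytes, streams[i]);
    // Per-GPU work is enqueued with no hidden synchronization, mirroring
    // execute_keyswitch / execute_pbs after this change.
    work_kernel<<<(n_per_gpu + 255) / 256, 256, 0, streams[i]>>>(slices[i],
                                                                 n_per_gpu);
    // Gather: copy the result back to GPU 0, still asynchronously.
    cudaMemcpyPeerAsync(d_dst + i * n_per_gpu, gpus[0], slices[i], gpus[i],
                        bytes, streams[i]);
    cudaFreeAsync(slices[i], streams[i]);
  }

  // The caller decides when to block: wait for every active GPU before
  // GPU 0 reads d_dst, as the new loop in the multiplication path does.
  for (size_t i = 0; i < streams.size(); i++)
    cudaStreamSynchronize(streams[i]);
}
```

The design point is that a chain of scatter, keyswitch, PBS and gather enqueued on the same per-GPU streams only needs these two synchronization points, instead of a stream-wide stall inside every helper call.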