From 5eaf4fe5e0f508761d72cbe6ae9f36d1128224fd Mon Sep 17 00:00:00 2001
From: Pedro Alves
Date: Wed, 26 Jun 2024 18:40:38 +0000
Subject: [PATCH] refactor(gpu): add a parameter to enable optimizations in
 case lwe_indexes_(in/out) is trivial

---
 .../tfhe-cuda-backend/cuda/include/integer.h  |  61 ++++----
 .../cuda/src/integer/integer.cuh              |  33 ++--
 .../cuda/src/integer/multiplication.cuh       |  34 ++---
 .../cuda/src/utils/helper_multi_gpu.cuh       | 141 ++++++------
 4 files changed, 109 insertions(+), 160 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
index be2061e339..f79feb759a 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -463,6 +463,8 @@ template <typename Torus> struct int_radix_lut {
   Torus *lwe_indexes_out;
   Torus *h_lwe_indexes_in;
   Torus *h_lwe_indexes_out;
+  // Enable optimizations if lwe_indexes_(in/out) are trivial
+  bool using_trivial_lwe_indexes = true;
   // lwe_trivial_indexes is the intermediary index we need in case
   // lwe_indexes_in != lwe_indexes_out
   Torus *lwe_trivial_indexes;
@@ -537,22 +539,20 @@ template <typename Torus> struct int_radix_lut {
     h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
     h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
 
-    auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
     for (int i = 0; i < num_radix_blocks; i++)
-      h_lwe_indexes[i] = i;
+      h_lwe_indexes_in[i] = i;
 
-    cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes,
+    cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
                              num_radix_blocks * sizeof(Torus), streams[0],
                              gpu_indexes[0]);
-    cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes,
+    cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in,
                              num_radix_blocks * sizeof(Torus), streams[0],
                              gpu_indexes[0]);
-    cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
+    cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in,
                              num_radix_blocks * sizeof(Torus), streams[0],
                              gpu_indexes[0]);
-    cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                             host_free_on_stream_callback, h_lwe_indexes);
+    memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
+           num_radix_blocks * sizeof(Torus));
 
     /// With multiple GPUs we allocate arrays to be pushed to the vectors and
     /// copy data on each GPU then when we gather data to GPU 0 we can copy
@@ -641,22 +641,20 @@ template <typename Torus> struct int_radix_lut {
     h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
     h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
 
-    auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
     for (int i = 0; i < num_radix_blocks; i++)
-      h_lwe_indexes[i] = i;
+      h_lwe_indexes_in[i] = i;
 
-    cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes,
+    cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
                              num_radix_blocks * sizeof(Torus), streams[0],
                              gpu_indexes[0]);
-    cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes,
+    cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in,
                              num_radix_blocks * sizeof(Torus), streams[0],
                              gpu_indexes[0]);
-    cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
+    cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in,
                              num_radix_blocks * sizeof(Torus), streams[0],
                              gpu_indexes[0]);
-    cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                             host_free_on_stream_callback, h_lwe_indexes);
+    memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
+           num_radix_blocks * sizeof(Torus));
   }
 
   // Return a pointer to idx-ith lut at gpu_index's global memory
@@ -674,6 +672,22 @@ template <typename Torus> struct int_radix_lut {
     return &lut_indexes[ind];
   }
 
+  // If this function is called we assume the lwe_indexes_(in/out) are no
+  // longer trivial, so we disable the related optimizations
+  void set_lwe_indexes(cudaStream_t stream, uint32_t gpu_index,
+                       Torus *h_indexes_in, Torus *h_indexes_out) {
+
+    memcpy(h_lwe_indexes_in, h_indexes_in, num_blocks * sizeof(Torus));
+    memcpy(h_lwe_indexes_out, h_indexes_out, num_blocks * sizeof(Torus));
+
+    cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
+                             num_blocks * sizeof(Torus), stream, gpu_index);
+    cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_out,
+                             num_blocks * sizeof(Torus), stream, gpu_index);
+
+    using_trivial_lwe_indexes = false;
+  }
+
   // Broadcast luts from gpu src_gpu_idx to all active gpus
   void broadcast_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
                      uint32_t src_gpu_idx) {
@@ -805,12 +819,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
       for (int i = 0; i < bits_per_block; i++)
         h_lwe_indexes_in[i + j * bits_per_block] = j;
     }
-    cuda_memcpy_async_to_gpu(lut->lwe_indexes_in, h_lwe_indexes_in,
-                             num_radix_blocks * bits_per_block *
-                                 sizeof(Torus),
-                             streams[0], gpu_indexes[0]);
-    cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                             host_free_on_stream_callback, h_lwe_indexes_in);
 
     /**
      * the output should aim different lwe ciphertexts, so lwe_indexes_out =
@@ -822,10 +830,11 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
     for (int i = 0; i < num_radix_blocks * bits_per_block; i++)
       h_lwe_indexes_out[i] = i;
 
-    cuda_memcpy_async_to_gpu(lut->lwe_indexes_out, h_lwe_indexes_out,
-                             num_radix_blocks * bits_per_block *
-                                 sizeof(Torus),
-                             streams[0], gpu_indexes[0]);
+    lut->set_lwe_indexes(streams[0], gpu_indexes[0], h_lwe_indexes_in,
+                         h_lwe_indexes_out);
+
+    cuda_stream_add_callback(streams[0], gpu_indexes[0],
+                             host_free_on_stream_callback, h_lwe_indexes_in);
     cuda_stream_add_callback(streams[0], gpu_indexes[0],
                              host_free_on_stream_callback, h_lwe_indexes_out);
   }
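Usage sketch (illustrative only, not taken from the patch): the flag starts out true because both constructors above now fill lwe_indexes_(in/out) with the identity mapping, and only set_lwe_indexes() clears it. A caller that needs a non-trivial mapping is expected to go through that setter so the host mirrors, the device buffers and the fast-path flag stay consistent, as int_bit_extract_luts_buffer does above. The sketch below assumes a uint64_t Torus, a lut whose num_blocks equals num_radix_blocks, and a made-up "reversed output" ordering.

// Illustrative only; requires the backend's integer.h (int_radix_lut) and the
// CUDA runtime headers (cudaStream_t) in addition to the includes below.
#include <cstdint>
#include <vector>

void install_reversed_output_indexes(cudaStream_t stream, uint32_t gpu_index,
                                     int_radix_lut<uint64_t> *lut,
                                     uint32_t num_radix_blocks) {
  std::vector<uint64_t> h_in(num_radix_blocks), h_out(num_radix_blocks);
  for (uint32_t i = 0; i < num_radix_blocks; i++) {
    h_in[i] = i;                         // inputs keep the trivial order
    h_out[i] = num_radix_blocks - 1 - i; // outputs are written in reverse
  }
  // Copies both tables to the host mirrors and the device buffers, and flips
  // using_trivial_lwe_indexes to false so the multi-GPU scatter/gather take
  // the per-block indexed path instead of the contiguous fast path.
  lut->set_lwe_indexes(stream, gpu_index, h_in.data(), h_out.data());
}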
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 9856a19d18..3f28fdf78b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -185,9 +185,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
 
     /// With multiple GPUs we push to the vectors on each GPU then when we
     /// gather data to GPU 0 we can copy back to the original indexing
-    multi_gpu_lwe_trivial_scatter<Torus>(
+    multi_gpu_lwe_scatter<Torus>(
         streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_in,
-        num_radix_blocks, big_lwe_dimension + 1, false);
+        lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
+        big_lwe_dimension + 1, false);
 
     /// Apply KS to go from a big LWE dimension to a small LWE dimension
     execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
@@ -207,9 +208,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
         cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
 
     /// Copy data back to GPU 0 and release vecs
-    multi_gpu_lwe_trivial_gather<Torus>(
-        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_after_pbs_vec,
-        num_radix_blocks, big_lwe_dimension + 1, false);
+    multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+                                lwe_after_pbs_vec, lut->h_lwe_indexes_out,
+                                lut->using_trivial_lwe_indexes,
+                                num_radix_blocks, big_lwe_dimension + 1, false);
 
     /// Synchronize all GPUs
     for (uint i = 0; i < active_gpu_count; i++) {
@@ -271,13 +273,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
         cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
   } else {
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-    // multi_gpu_lwe_scatter<Torus>(
-    //     streams, gpu_indexes, gpu_count, lwe_array_in_vec,
-    //     lwe_array_pbs_in, h_lwe_indexes_in, num_radix_blocks,
-    //     big_lwe_dimension + 1, false);
-    multi_gpu_lwe_trivial_scatter<Torus>(
+    multi_gpu_lwe_scatter<Torus>(
         streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
-        num_radix_blocks, big_lwe_dimension + 1, false);
+        lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
+        big_lwe_dimension + 1, false);
 
     /// Apply KS to go from a big LWE dimension to a small LWE dimension
     execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
@@ -297,14 +296,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
         cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
 
     /// Copy data back to GPU 0 and release vecs
-    // multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count,
-    //                             lwe_array_out,
-    //                             lwe_after_pbs_vec, h_lwe_indexes_out,
-    //                             num_radix_blocks, big_lwe_dimension + 1,
-    //                             false);
-    multi_gpu_lwe_trivial_gather<Torus>(
-        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_after_pbs_vec,
-        num_radix_blocks, big_lwe_dimension + 1, false);
+    multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+                                lwe_after_pbs_vec, lut->h_lwe_indexes_out,
+                                lut->using_trivial_lwe_indexes,
+                                num_radix_blocks, big_lwe_dimension + 1, false);
 
     /// Synchronize all GPUs
     for (uint i = 0; i < active_gpu_count; i++) {
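Why threading lut->using_trivial_lwe_indexes through these call sites pays off is easiest to see in the helper_multi_gpu.cuh hunks further below: when the indexes are trivial, the blocks owned by GPU i are contiguous in the source array, so the whole chunk moves with a single copy of inputs_on_gpu * elements_per_input elements instead of one copy per block. The host-only model below reproduces that arithmetic with memcpy standing in for cuda_memcpy_async_gpu_to_gpu; the balanced-split helper is an assumption for illustration, not necessarily how get_num_inputs_on_gpu rounds.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Stand-in for get_num_inputs_on_gpu: a simple balanced split (assumption).
static uint32_t blocks_on_gpu(uint32_t num_inputs, uint32_t i, uint32_t gpus) {
  return num_inputs / gpus + (i < num_inputs % gpus ? 1 : 0);
}

// Host-only model of multi_gpu_lwe_scatter: dest[i] receives GPU i's blocks.
void model_scatter(std::vector<std::vector<uint64_t>> &dest,
                   const uint64_t *src, const uint64_t *h_src_indexes,
                   bool is_trivial_index, uint32_t num_inputs,
                   uint32_t elements_per_input) {
  uint32_t gpus = (uint32_t)dest.size();
  uint32_t gpu_offset = 0;
  for (uint32_t i = 0; i < gpus; i++) {
    uint32_t n = blocks_on_gpu(num_inputs, i, gpus);
    dest[i].resize((size_t)n * elements_per_input);
    if (is_trivial_index) {
      // Fast path: GPU i's blocks are contiguous in src, so one big copy.
      std::memcpy(dest[i].data(),
                  src + (size_t)gpu_offset * elements_per_input,
                  (size_t)n * elements_per_input * sizeof(uint64_t));
    } else {
      // Indexed path: one copy per block, following the host-side table.
      for (uint32_t j = 0; j < n; j++)
        std::memcpy(dest[i].data() + (size_t)j * elements_per_input,
                    src + (size_t)h_src_indexes[gpu_offset + j] *
                              elements_per_input,
                    (size_t)elements_per_input * sizeof(uint64_t));
    }
    gpu_offset += n;
  }
}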
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
index c0b0690692..10a2674d0d 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -366,19 +366,13 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
         mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
         mem_ptr->params.pbs_type, true);
   } else {
-    auto h_lwe_indexes_in = luts_message_carry->h_lwe_indexes_in;
-    auto h_lwe_indexes_out = luts_message_carry->h_lwe_indexes_out;
-    cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lwe_indexes_in,
-                             total_count * sizeof(Torus), streams[0],
-                             gpu_indexes[0]);
-    cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lwe_indexes_out,
-                             total_count * sizeof(Torus), streams[0],
-                             gpu_indexes[0]);
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-    multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
-                                 new_blocks_vec, new_blocks, h_lwe_indexes_in,
-                                 message_count, big_lwe_size, false);
+    multi_gpu_lwe_scatter<Torus>(
+        streams, gpu_indexes, gpu_count, new_blocks_vec, new_blocks,
+        luts_message_carry->h_lwe_indexes_in,
+        luts_message_carry->using_trivial_lwe_indexes, message_count,
+        big_lwe_size, false);
 
     /// Apply KS to go from a big LWE dimension to a small LWE dimension
     /// After this keyswitch execution, we need to synchronize the streams
@@ -394,13 +388,15 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
     /// different configuration
     multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count,
                                 small_lwe_vector, small_lwe_vector_vec,
-                                h_lwe_indexes_in, message_count,
-                                small_lwe_size);
+                                luts_message_carry->h_lwe_indexes_in,
+                                luts_message_carry->using_trivial_lwe_indexes,
+                                message_count, small_lwe_size);
 
-    multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
-                                 small_lwe_vector_vec, small_lwe_vector,
-                                 h_lwe_indexes_in, total_count,
-                                 small_lwe_size, false);
+    multi_gpu_lwe_scatter<Torus>(
+        streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
+        small_lwe_vector, luts_message_carry->h_lwe_indexes_in,
+        luts_message_carry->using_trivial_lwe_indexes, total_count,
+        small_lwe_size, false);
 
     /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
     /// dimension to a big LWE dimension
@@ -415,7 +411,9 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
         mem_ptr->params.pbs_type, false);
 
     multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
-                                lwe_after_pbs_vec, h_lwe_indexes_out,
+                                lwe_after_pbs_vec,
+                                luts_message_carry->h_lwe_indexes_out,
+                                luts_message_carry->using_trivial_lwe_indexes,
                                 total_count, big_lwe_size);
   }
 
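Before the helper itself (next file), a rough sense of the saving in host_integer_sum_ciphertexts_vec_kb: with trivial indexes each scatter/gather issues one device-to-device copy per active GPU rather than one per block. The numbers below are assumptions chosen only to make the comparison concrete, not values taken from the patch: 64 blocks, 4 GPUs, 2049-word big-LWE ciphertexts, 8-byte Torus.

#include <cstdint>
#include <cstdio>

int main() {
  // Assumed sizes, for illustration only.
  const uint64_t num_blocks = 64;     // radix blocks being scattered
  const uint64_t gpus = 4;            // active GPUs
  const uint64_t big_lwe_size = 2049; // Torus words per big-LWE ciphertext
  const uint64_t torus_bytes = 8;     // sizeof(uint64_t)

  // Indexed path (is_trivial_index == false): one copy per block.
  const uint64_t indexed_copies = num_blocks;
  const uint64_t indexed_bytes_each = big_lwe_size * torus_bytes;

  // Trivial path (is_trivial_index == true): one contiguous copy per GPU.
  const uint64_t trivial_copies = gpus;
  const uint64_t trivial_bytes_each =
      (num_blocks / gpus) * big_lwe_size * torus_bytes;

  std::printf("indexed: %llu copies x %llu bytes\n",
              (unsigned long long)indexed_copies,
              (unsigned long long)indexed_bytes_each);
  std::printf("trivial: %llu copies x %llu bytes\n",
              (unsigned long long)trivial_copies,
              (unsigned long long)trivial_bytes_each);
  // Same total bytes either way; the win is 16x fewer async copy launches.
  return 0;
}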
diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
index 45df2ff808..62713c69eb 100644
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
@@ -74,7 +74,8 @@ template <typename Torus>
 void multi_gpu_lwe_scatter(cudaStream_t *streams, uint32_t *gpu_indexes,
                            uint32_t gpu_count, std::vector<Torus *> &dest,
                            Torus *src, Torus *h_src_indexes,
-                           uint32_t num_inputs, uint32_t elements_per_input,
+                           bool is_trivial_index, uint32_t num_inputs,
+                           uint32_t elements_per_input,
                            bool sync_threads = true) {
 
   auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
@@ -90,17 +91,27 @@ void multi_gpu_lwe_scatter(cudaStream_t *streams, uint32_t *gpu_indexes,
     for (uint j = 0; j < i; j++) {
       gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
     }
-    auto src_indexes = h_src_indexes + gpu_offset;
 
-    // TODO Check if we can increase parallelization by adding another omp
-    // clause here
-    for (uint j = 0; j < inputs_on_gpu; j++) {
-      auto d_dest = dest[i] + j * elements_per_input;
-      auto d_src = src + src_indexes[j] * elements_per_input;
-
-      cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
-                                   elements_per_input * sizeof(Torus),
-                                   streams[i], gpu_indexes[i]);
+    if (is_trivial_index) {
+      auto d_dest = dest[i];
+      auto d_src = src + gpu_offset * elements_per_input;
+      cuda_memcpy_async_gpu_to_gpu(
+          d_dest, d_src, inputs_on_gpu * elements_per_input * sizeof(Torus),
+          streams[i], gpu_indexes[i]);
+
+    } else {
+      auto src_indexes = h_src_indexes + gpu_offset;
+
+      // TODO Check if we can increase parallelization by adding another omp
+      // clause here
+      for (uint j = 0; j < inputs_on_gpu; j++) {
+        auto d_dest = dest[i] + j * elements_per_input;
+        auto d_src = src + src_indexes[j] * elements_per_input;
+
+        cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
+                                     elements_per_input * sizeof(Torus),
+                                     streams[i], gpu_indexes[i]);
+      }
     }
   }
 
@@ -116,8 +127,8 @@ template <typename Torus>
 void multi_gpu_lwe_gather(cudaStream_t *streams, uint32_t *gpu_indexes,
                           uint32_t gpu_count, Torus *dest,
                           const std::vector<Torus *> &src,
-                          Torus *h_dest_indexes, uint32_t num_inputs,
-                          uint32_t elements_per_input,
+                          Torus *h_dest_indexes, bool is_trivial_index,
+                          uint32_t num_inputs, uint32_t elements_per_input,
                           bool sync_threads = true) {
 
   auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
@@ -132,17 +143,27 @@ void multi_gpu_lwe_gather(cudaStream_t *streams, uint32_t *gpu_indexes,
     for (uint j = 0; j < i; j++) {
       gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
     }
-    auto dest_indexes = h_dest_indexes + gpu_offset;
-
-    // TODO Check if we can increase parallelization by adding another omp
-    // clause here
-    for (uint j = 0; j < inputs_on_gpu; j++) {
-      auto d_dest = dest + dest_indexes[j] * elements_per_input;
-      auto d_src = src[i] + j * elements_per_input;
 
-      cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
-                                   elements_per_input * sizeof(Torus),
-                                   streams[i], gpu_indexes[i]);
+    if (is_trivial_index) {
+      auto d_dest = dest + gpu_offset * elements_per_input;
+      auto d_src = src[i];
+
+      cuda_memcpy_async_gpu_to_gpu(
+          d_dest, d_src, inputs_on_gpu * elements_per_input * sizeof(Torus),
+          streams[i], gpu_indexes[i]);
+    } else {
+      auto dest_indexes = h_dest_indexes + gpu_offset;
+
+      // TODO Check if we can increase parallelization by adding another omp
+      // clause here
+      for (uint j = 0; j < inputs_on_gpu; j++) {
+        auto d_dest = dest + dest_indexes[j] * elements_per_input;
+        auto d_src = src[i] + j * elements_per_input;
+
+        cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
+                                     elements_per_input * sizeof(Torus),
+                                     streams[i], gpu_indexes[i]);
+      }
     }
   }
 
@@ -150,80 +171,6 @@ void multi_gpu_lwe_gather(cudaStream_t *streams, uint32_t *gpu_indexes,
     for (uint i = 0; i < active_gpu_count; i++)
       cuda_synchronize_stream(streams[i], gpu_indexes[i]);
 }
-/// Load an array residing on one GPU to all active gpus
-/// and split the array among them.
-/// The input and output indexing is always the trivial one
-template <typename Torus>
-void multi_gpu_lwe_trivial_scatter(cudaStream_t *streams, uint32_t *gpu_indexes,
-                                   uint32_t gpu_count,
-                                   std::vector<Torus *> &dest, Torus *src,
-                                   uint32_t num_inputs,
-                                   uint32_t elements_per_input,
-                                   bool sync_threads = true) {
-
-  auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
-
-  if (sync_threads)
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
-  dest.resize(active_gpu_count);
-
-#pragma omp parallel for num_threads(active_gpu_count)
-  for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
-    auto gpu_offset = 0;
-    for (uint j = 0; j < i; j++) {
-      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
-    }
-
-    auto d_dest = dest[i];
-    auto d_src = src + gpu_offset * elements_per_input;
-    cuda_memcpy_async_gpu_to_gpu(
-        d_dest, d_src, inputs_on_gpu * elements_per_input * sizeof(Torus),
-        streams[i], gpu_indexes[i]);
-  }
-
-  if (sync_threads)
-    for (uint i = 0; i < active_gpu_count; i++)
-      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-}
-
-/// Copy data from multiple GPUs back to GPU 0 following the indexing given in
-/// dest_indexes
-/// The input indexing should be the trivial one
-template <typename Torus>
-void multi_gpu_lwe_trivial_gather(cudaStream_t *streams, uint32_t *gpu_indexes,
-                                  uint32_t gpu_count, Torus *dest,
-                                  const std::vector<Torus *> &src,
-                                  uint32_t num_inputs,
-                                  uint32_t elements_per_input,
-                                  bool sync_threads = true) {
-
-  auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
-
-  if (sync_threads)
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
-#pragma omp parallel for num_threads(active_gpu_count)
-  for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
-    auto gpu_offset = 0;
-    for (uint j = 0; j < i; j++) {
-      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
-    }
-
-    auto d_dest = dest + gpu_offset * elements_per_input;
-    auto d_src = src[i];
-
-    cuda_memcpy_async_gpu_to_gpu(
-        d_dest, d_src, inputs_on_gpu * elements_per_input * sizeof(Torus),
-        streams[i], gpu_indexes[i]);
-  }
-
-  if (sync_threads)
-    for (uint i = 0; i < active_gpu_count; i++)
-      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-}
 
 template <typename Torus>
 void multi_gpu_lwe_release(cudaStream_t *streams, uint32_t *gpu_indexes,
                            std::vector<Torus *> &vec,