From 1bfa71f81e583defa261972d32ab6f9a2ae94592 Mon Sep 17 00:00:00 2001
From: Pedro Alves
Date: Wed, 26 Jun 2024 15:05:11 +0000
Subject: [PATCH] refactor(gpu): modify apply lut methods to assume inputs and
 outputs have trivial indexing

---
 .../tfhe-cuda-backend/cuda/include/integer.h  |  2 +-
 .../cuda/src/integer/integer.cuh              | 80 ++++++++-----
 .../cuda/src/integer/multiplication.cuh       | 19 ++---
 .../cuda/src/utils/helper_multi_gpu.cuh       | 20 ++---
 4 files changed, 56 insertions(+), 65 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
index 6d5c1920cb..34ee0440a2 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -498,7 +498,7 @@ template <typename Torus> struct int_radix_lut {
       cudaSetDevice(i);
       int8_t *gpu_pbs_buffer;
       auto num_blocks_on_gpu =
-          get_num_inputs_on_gpu(num_radix_blocks, i, gpu_count);
+          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
 
       execute_scratch_pbs<Torus>(
           streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index ccf141c0e7..03f7d0c706 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -155,16 +155,15 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
   auto polynomial_size = params.polynomial_size;
   auto grouping_factor = params.grouping_factor;
 
-  /// Make sure all data that should be on GPU 0 is indeed there
-  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
   /// For multi GPU execution we create vectors of pointers for inputs and
   /// outputs
   std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
   std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
   std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
   std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
-  if (gpu_count == 1) {
+
+  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+  if (active_gpu_count == 1) {
     execute_keyswitch<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
                              lwe_trivial_indexes_vec[0], lwe_array_in,
                              lut->lwe_indexes_in, ksks, big_lwe_dimension,
@@ -181,21 +180,14 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
         grouping_factor, num_radix_blocks, 1, 0,
         cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
   } else {
-    auto h_lwe_indexes_in = lut->h_lwe_indexes_in;
-    auto h_lwe_indexes_out = lut->h_lwe_indexes_out;
-    cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lut->lwe_indexes_in,
-                             num_radix_blocks * sizeof(Torus), streams[0],
-                             gpu_indexes[0]);
-    cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lut->lwe_indexes_out,
-                             num_radix_blocks * sizeof(Torus), streams[0],
-                             gpu_indexes[0]);
+    /// Make sure all data that should be on GPU 0 is indeed there
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
 
     /// With multiple GPUs we push to the vectors on each GPU then when we
     /// gather data to GPU 0 we can copy back to the original indexing
-    multi_gpu_lwe_scatter<Torus>(
+    multi_gpu_lwe_trivial_scatter<Torus>(
         streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_in,
-        h_lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1, false);
+        num_radix_blocks, big_lwe_dimension + 1, false);
 
     /// Apply KS to go from a big LWE dimension to a small LWE dimension
     execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
@@ -215,15 +207,14 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
         cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
 
     /// Copy data back to GPU 0 and release vecs
-    multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
-                                lwe_after_pbs_vec, h_lwe_indexes_out,
-                                num_radix_blocks, big_lwe_dimension + 1, false);
-  }
+    multi_gpu_lwe_trivial_gather<Torus>(
+        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_after_pbs_vec,
+        num_radix_blocks, big_lwe_dimension + 1, false);
 
-  /// Synchronize all GPUs
-  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
-  for (uint i = 0; i < active_gpu_count; i++) {
-    cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    /// Synchronize all GPUs
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    }
   }
 }
 
@@ -254,15 +245,15 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
       num_radix_blocks);
   check_cuda_error(cudaGetLastError());
 
-  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
   /// For multi GPU execution we create vectors of pointers for inputs and
   /// outputs
   std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
   std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
   std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
   std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
-  if (gpu_count == 1) {
+
+  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+  if (active_gpu_count == 1) {
     execute_keyswitch<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
                              lwe_trivial_indexes_vec[0], lwe_array_pbs_in,
                              lut->lwe_indexes_in, ksks, big_lwe_dimension,
@@ -279,19 +270,14 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
         grouping_factor, num_radix_blocks, 1, 0,
         cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
   } else {
-    auto h_lwe_indexes_in = lut->h_lwe_indexes_in;
-    auto h_lwe_indexes_out = lut->h_lwe_indexes_out;
-    cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lut->lwe_indexes_in,
-                             num_radix_blocks * sizeof(Torus), streams[0],
-                             gpu_indexes[0]);
-    cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lut->lwe_indexes_out,
-                             num_radix_blocks * sizeof(Torus), streams[0],
-                             gpu_indexes[0]);
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
-    multi_gpu_lwe_scatter<Torus>(
+    // multi_gpu_lwe_scatter<Torus>(
+    //     streams, gpu_indexes, gpu_count, lwe_array_in_vec,
+    //     lwe_array_pbs_in, h_lwe_indexes_in, num_radix_blocks,
+    //     big_lwe_dimension + 1, false);
+    multi_gpu_lwe_trivial_scatter<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
-        h_lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1, false);
+        num_radix_blocks, big_lwe_dimension + 1, false);
 
     /// Apply KS to go from a big LWE dimension to a small LWE dimension
     execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
@@ -311,15 +297,19 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
         cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
 
     /// Copy data back to GPU 0 and release vecs
-    multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
-                                lwe_after_pbs_vec, h_lwe_indexes_out,
-                                num_radix_blocks, big_lwe_dimension + 1, false);
-  }
-
-  /// Synchronize all GPUs
-  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
-  for (uint i = 0; i < active_gpu_count; i++) {
-    cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    // multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count,
+    //                             lwe_array_out,
+    //                             lwe_after_pbs_vec, h_lwe_indexes_out,
+    //                             num_radix_blocks, big_lwe_dimension + 1,
+    //                             false);
+    multi_gpu_lwe_trivial_gather<Torus>(
+        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_after_pbs_vec,
+        num_radix_blocks, big_lwe_dimension + 1, false);
+
+    /// Synchronize all GPUs
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    }
   }
 }
 
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
index fe6c6a007e..b2a1c79dff 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -331,8 +331,6 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
 
   luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
 
-  auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
-
   /// For multi GPU execution we create vectors of pointers for inputs and
   /// outputs
   std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
@@ -343,21 +341,24 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
   std::vector<Torus *> lwe_trivial_indexes_vec =
       luts_message_carry->lwe_trivial_indexes_vec;
 
-  if (gpu_count == 1) {
+  // TODO We should be able to run keyswitch on a single GPU and PBS on
+  // multiple GPUs if needed
+  auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
+  if (active_gpu_count == 1) {
     /// Apply KS to go from a big LWE dimension to a small LWE dimension
     /// After this keyswitch execution, we need to synchronize the streams
     /// because the keyswitch and PBS do not operate on the same number of
     /// inputs
-    execute_keyswitch<Torus>(
-        streams, gpu_indexes, gpu_count, small_lwe_vector, lwe_indexes_in,
-        new_blocks, lwe_indexes_in, ksks, polynomial_size * glwe_dimension,
-        lwe_dimension, mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
-        message_count, true);
+    execute_keyswitch<Torus>(streams, gpu_indexes, 1, small_lwe_vector,
+                             lwe_indexes_in, new_blocks, lwe_indexes_in, ksks,
+                             polynomial_size * glwe_dimension, lwe_dimension,
+                             mem_ptr->params.ks_base_log,
+                             mem_ptr->params.ks_level, message_count, true);
 
     /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
     /// dimension to a big LWE dimension
     execute_pbs<Torus>(
-        streams, gpu_indexes, gpu_count, new_blocks, lwe_indexes_out,
+        streams, gpu_indexes, 1, new_blocks, lwe_indexes_out,
         luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
         small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer,
         glwe_dimension, lwe_dimension, polynomial_size,
diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
index 937f4656a1..45df2ff808 100644
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
@@ -55,7 +55,7 @@ void multi_gpu_lwe_init(cudaStream_t *streams, uint32_t *gpu_indexes,
   dest.resize(active_gpu_count);
 #pragma omp parallel for num_threads(active_gpu_count)
   for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
     Torus *d_array = (Torus *)cuda_malloc_async(
         inputs_on_gpu * elements_per_input * sizeof(Torus), streams[i],
         gpu_indexes[i]);
@@ -83,12 +83,12 @@ void multi_gpu_lwe_scatter(cudaStream_t *streams, uint32_t *gpu_indexes,
 
   cuda_synchronize_stream(streams[0], gpu_indexes[0]);
   dest.resize(active_gpu_count);
-  // #pragma omp parallel for num_threads(active_gpu_count)
+#pragma omp parallel for num_threads(active_gpu_count)
   for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
     auto gpu_offset = 0;
     for (uint j = 0; j < i; j++) {
-      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
+      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
     }
 
     auto src_indexes = h_src_indexes + gpu_offset;
@@ -127,10 +127,10 @@ void multi_gpu_lwe_gather(cudaStream_t *streams, uint32_t *gpu_indexes,
 
 #pragma omp parallel for num_threads(active_gpu_count)
   for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
     auto gpu_offset = 0;
     for (uint j = 0; j < i; j++) {
-      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
+      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
     }
 
     auto dest_indexes = h_dest_indexes + gpu_offset;
@@ -170,10 +170,10 @@ void multi_gpu_lwe_trivial_scatter(cudaStream_t *streams, uint32_t *gpu_indexes,
 
 #pragma omp parallel for num_threads(active_gpu_count)
   for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
     auto gpu_offset = 0;
     for (uint j = 0; j < i; j++) {
-      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
+      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
     }
 
     auto d_dest = dest[i];
@@ -206,10 +206,10 @@ void multi_gpu_lwe_trivial_gather(cudaStream_t *streams, uint32_t *gpu_indexes,
 
 #pragma omp parallel for num_threads(active_gpu_count)
   for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
     auto gpu_offset = 0;
     for (uint j = 0; j < i; j++) {
-      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
+      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
     }
 
     auto d_dest = dest + gpu_offset * elements_per_input;
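
Note (reviewer sketch, not part of the patch): with trivial indexing, block i of a radix
ciphertext is assumed to sit at position i of the LWE array, so the multi-GPU path can hand
each active GPU a contiguous slice of the inputs and gather the results back in the same
order, instead of routing every block through the h_lwe_indexes_in / h_lwe_indexes_out
lookup tables. The standalone C++ sketch below only illustrates that offset arithmetic, i.e.
the gpu_offset loops used by multi_gpu_lwe_trivial_scatter/gather above; num_inputs_on_gpu
is a hypothetical stand-in for the backend's get_num_inputs_on_gpu, and the balanced split
it implements is an assumption, not necessarily the library's exact policy.

// Hypothetical illustration of the contiguous, trivially indexed split across
// active GPUs. Build with a plain C++ compiler, e.g. g++ -std=c++17 split.cpp
#include <cstdint>
#include <cstdio>

// Assumed balanced split: the first (num_inputs % gpu_count) GPUs receive one
// extra input. Stand-in for get_num_inputs_on_gpu; the real policy may differ.
static uint32_t num_inputs_on_gpu(uint32_t num_inputs, uint32_t gpu_index,
                                  uint32_t gpu_count) {
  uint32_t base = num_inputs / gpu_count;
  uint32_t extra = num_inputs % gpu_count;
  return base + (gpu_index < extra ? 1 : 0);
}

int main() {
  const uint32_t num_radix_blocks = 10; // e.g. blocks of one radix ciphertext
  const uint32_t active_gpu_count = 3;  // GPUs that actually receive work

  uint32_t gpu_offset = 0;
  for (uint32_t i = 0; i < active_gpu_count; i++) {
    uint32_t inputs_on_gpu =
        num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
    // With trivial indexing GPU i owns the contiguous block range
    // [gpu_offset, gpu_offset + inputs_on_gpu); no per-block index tables
    // (h_lwe_indexes_in / h_lwe_indexes_out) are consulted.
    std::printf("GPU %u: blocks [%u, %u)\n", i, gpu_offset,
                gpu_offset + inputs_on_gpu);
    gpu_offset += inputs_on_gpu;
  }
  return 0;
}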