From 46c9f067b11fb211b9bc91ad6573e7e218dc7103 Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Thu, 18 Jul 2024 17:38:40 +0200 Subject: [PATCH] fix(gpu): small fixes --- .../tfhe-cuda-backend/cuda/include/integer.h | 18 +++++++++++------- .../cuda/src/integer/addition.cuh | 2 +- .../cuda/src/integer/cmux.cuh | 4 +++- .../cuda/src/integer/comparison.cuh | 1 - .../cuda/src/integer/div_rem.cuh | 8 ++++---- .../cuda/src/integer/scalar_comparison.cuh | 9 ++++----- .../cuda/src/integer/scalar_shifts.cuh | 2 +- .../cuda/src/utils/helper_multi_gpu.cu | 1 - 8 files changed, 24 insertions(+), 21 deletions(-) diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index 3fa42e8382..df4eb39df0 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -1736,7 +1736,7 @@ template struct int_arithmetic_scalar_shift_buffer { int_radix_params params, uint32_t num_radix_blocks, bool allocate_gpu_memory) { - active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); + active_gpu_count = get_active_gpu_count(1, gpu_count); // In the arithmetic shift, a PBS has to be applied to the last rotated // block twice: once to shift it, once to compute the padding block to be // copied onto all blocks to the left of the last rotated block @@ -3074,12 +3074,12 @@ template struct int_resolve_signed_overflow_memory { template struct int_signed_overflowing_add_or_sub_memory { int_radix_params params; + uint32_t active_gpu_count; // memory objects for other operations int_sc_prop_memory *scp_mem; int_last_block_inner_propagate_memory *las_block_prop_mem; int_resolve_signed_overflow_memory *resolve_overflow_mem; - // lookupt tables // sub streams cudaStream_t *sub_streams_1; @@ -3118,13 +3118,17 @@ template struct int_signed_overflowing_add_or_sub_memory { int_radix_params params, uint32_t num_blocks, SIGNED_OPERATION op, bool allocate_gpu_memory) { this->params = params; + active_gpu_count = get_active_gpu_count(num_blocks, gpu_count); - allocate_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks); + allocate_temporary_buffers(streams, gpu_indexes, active_gpu_count, + num_blocks); // initialize streams - sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - for (uint j = 0; j < gpu_count; j++) { + sub_streams_1 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + sub_streams_2 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]); sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]); } @@ -3160,7 +3164,7 @@ template struct int_signed_overflowing_add_or_sub_memory { cuda_drop_async(last_block_inner_propagation, streams[0], gpu_indexes[0]); // sub streams - for (uint i = 0; i < gpu_count; i++) { + for (uint i = 0; i < active_gpu_count; i++) { cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]); cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh index bab675bf51..28b02a57f8 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh @@ -131,7 +131,7 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb( } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh index 43fe3b1846..a1057a443b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh @@ -76,8 +76,10 @@ __host__ void host_integer_radix_cmux_kb( mem_ptr->predicate_lut, bsks, ksks, num_radix_blocks); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) { cuda_synchronize_stream(true_streams[j], gpu_indexes[j]); + } + for (uint j = 0; j < mem_ptr->zero_if_false_buffer->active_gpu_count; j++) { cuda_synchronize_stream(false_streams[j], gpu_indexes[j]); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh index e1d3e8452f..d56d6e3297 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh @@ -245,7 +245,6 @@ __host__ void host_compare_with_zero_equality( int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, int32_t num_radix_blocks, int_radix_lut *zero_comparison) { - cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; auto message_modulus = params.message_modulus; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh index b2ee88c052..f38c591678 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh @@ -400,7 +400,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, gpu_count); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]); @@ -510,7 +510,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, gpu_indexes, gpu_count); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]); @@ -587,7 +587,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]); @@ -628,7 +628,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, bsks, ksks, num_blocks, mem_ptr->message_extract_lut_2); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh index 3aba53cce4..a8b6882394 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh @@ -133,7 +133,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]); cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]); } @@ -205,7 +205,6 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( std::function sign_handler_f, void **bsks, Torus **ksks, uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) { - cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; auto glwe_dimension = params.glwe_dimension; @@ -397,7 +396,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( signed_msb_lut->params.message_modulus); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]); cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]); } @@ -465,7 +464,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]); cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]); } @@ -737,7 +736,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb( } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]); cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh index cfc9a6773c..e612c9ab2f 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh @@ -245,7 +245,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace( } } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem->active_gpu_count; j++) { cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]); cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]); } diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu index cc769cfcc2..a6d6cdd540 100644 --- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu +++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu @@ -27,7 +27,6 @@ int cuda_setup_multi_gpu() { num_used_gpus += 1; } } else { - int has_peer_access_to_device_0; for (int i = 1; i < num_gpus; i++) num_used_gpus += 1; }