diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
index 36a691480f..6d5c1920cb 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -472,7 +472,6 @@ template <typename Torus> struct int_radix_lut {
   // lwe_indexes_in != lwe_indexes_out
   Torus *lwe_trivial_indexes;
   Torus *tmp_lwe_before_ks;
-  Torus *tmp_lwe_after_ks;
 
   /// For multi GPU execution we create vectors of pointers for inputs and
   /// outputs
@@ -583,8 +582,6 @@ template <typename Torus> struct int_radix_lut {
           (params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
       tmp_lwe_before_ks =
           (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
-      tmp_lwe_after_ks =
-          (Torus *)cuda_malloc_async(small_size, streams[0], gpu_indexes[0]);
     }
   }
 
@@ -606,7 +603,6 @@ template <typename Torus> struct int_radix_lut {
     buffer = base_lut_object->buffer;
     // Keyswitch
     tmp_lwe_before_ks = base_lut_object->tmp_lwe_before_ks;
-    tmp_lwe_after_ks = base_lut_object->tmp_lwe_after_ks;
 
     /// With multiple GPUs we allocate arrays to be pushed to the vectors and
     /// copy data on each GPU then when we gather data to GPU 0 we can copy back
@@ -729,7 +725,6 @@ template <typename Torus> struct int_radix_lut {
 
     if (!mem_reuse) {
       cuda_drop_async(tmp_lwe_before_ks, streams[0], gpu_indexes[0]);
-      cuda_drop_async(tmp_lwe_after_ks, streams[0], gpu_indexes[0]);
       cuda_synchronize_stream(streams[0], gpu_indexes[0]);
       for (int i = 0; i < buffer.size(); i++) {
         switch (params.pbs_type) {
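Note on the integer.h hunks above: with the single-GPU paths below writing their keyswitch output straight into lwe_after_ks_vec[0], tmp_lwe_after_ks no longer has any reader, so the member, its allocation, its mem-reuse wiring, and its release are all removed together. The sketch below illustrates the allocate/borrow/release discipline the surviving tmp_lwe_before_ks buffer still follows; scratch_buffers and everything in it are hypothetical stand-ins for int_radix_lut, not the backend's actual types.

#include <cstdio>
#include <cstdlib>

// Hypothetical stand-in for int_radix_lut's buffer ownership: a LUT object
// either allocates its own scratch space or borrows it from a base object
// (mem_reuse), and release() frees only what it owns.
struct scratch_buffers {
  double *tmp_lwe_before_ks;
  bool mem_reuse;

  // Fresh allocation: this object owns the scratch buffer (size in bytes).
  explicit scratch_buffers(size_t big_size)
      : tmp_lwe_before_ks((double *)malloc(big_size)), mem_reuse(false) {}

  // Memory reuse: borrow the base object's buffer; never free it here.
  explicit scratch_buffers(const scratch_buffers *base)
      : tmp_lwe_before_ks(base->tmp_lwe_before_ks), mem_reuse(true) {}

  void release() {
    if (!mem_reuse)
      free(tmp_lwe_before_ks);
    tmp_lwe_before_ks = nullptr;
  }
};

int main() {
  scratch_buffers base(1024);
  scratch_buffers reused(&base); // borrows base's buffer
  reused.release();              // does not free: it does not own the buffer
  base.release();                // the actual free happens exactly once
  printf("scratch ownership ok\n");
  return 0;
}

Freeing only when mem_reuse is false is what keeps the base object's buffer alive across derived LUTs; the removed tmp_lwe_after_ks lines participated in exactly that invariant, which is why all four of them must go at once.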
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 0234bed598..ccf141c0e7 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -164,44 +164,61 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
   std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
   std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
   std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
+  if (gpu_count == 1) {
+    execute_keyswitch<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
+                             lwe_trivial_indexes_vec[0], lwe_array_in,
+                             lut->lwe_indexes_in, ksks, big_lwe_dimension,
+                             small_lwe_dimension, ks_base_log, ks_level,
+                             num_radix_blocks, false);
+
+    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+    /// dimension to a big LWE dimension
+    execute_pbs<Torus>(
+        streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
+        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
+        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
+        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
+        grouping_factor, num_radix_blocks, 1, 0,
+        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
+  } else {
+    auto h_lwe_indexes_in = lut->h_lwe_indexes_in;
+    auto h_lwe_indexes_out = lut->h_lwe_indexes_out;
+    cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lut->lwe_indexes_in,
+                             num_radix_blocks * sizeof(Torus), streams[0],
+                             gpu_indexes[0]);
+    cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lut->lwe_indexes_out,
+                             num_radix_blocks * sizeof(Torus), streams[0],
+                             gpu_indexes[0]);
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-  auto h_lwe_indexes_in = lut->h_lwe_indexes_in;
-  auto h_lwe_indexes_out = lut->h_lwe_indexes_out;
-  cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lut->lwe_indexes_in,
-                           num_radix_blocks * sizeof(Torus), streams[0],
-                           gpu_indexes[0]);
-  cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lut->lwe_indexes_out,
-                           num_radix_blocks * sizeof(Torus), streams[0],
-                           gpu_indexes[0]);
-  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
-  /// With multiple GPUs we push to the vectors on each GPU then when we gather
-  /// data to GPU 0 we can copy back to the original indexing
-  multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
-                               lwe_array_in_vec, lwe_array_in, h_lwe_indexes_in,
-                               num_radix_blocks, big_lwe_dimension + 1, false);
-
-  /// Apply KS to go from a big LWE dimension to a small LWE dimension
-  execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
-                           lwe_trivial_indexes_vec, lwe_array_in_vec,
-                           lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
-                           small_lwe_dimension, ks_base_log, ks_level,
-                           num_radix_blocks, false);
-
-  /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
-  /// dimension to a big LWE dimension
-  execute_pbs<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
-      lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
-      lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
-      glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
-      pbs_level, grouping_factor, num_radix_blocks, 1, 0,
-      cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
-
-  /// Copy data back to GPU 0 and release vecs
-  multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
-                              lwe_after_pbs_vec, h_lwe_indexes_out,
-                              num_radix_blocks, big_lwe_dimension + 1, false);
+    /// With multiple GPUs we push to the vectors on each GPU then when we
+    /// gather data to GPU 0 we can copy back to the original indexing
+    multi_gpu_lwe_scatter<Torus>(
+        streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_in,
+        h_lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1, false);
+
+    /// Apply KS to go from a big LWE dimension to a small LWE dimension
+    execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
+                             lwe_trivial_indexes_vec, lwe_array_in_vec,
+                             lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
+                             small_lwe_dimension, ks_base_log, ks_level,
+                             num_radix_blocks, false);
+
+    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+    /// dimension to a big LWE dimension
+    execute_pbs<Torus>(
+        streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
+        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
+        lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
+        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
+        pbs_level, grouping_factor, num_radix_blocks, 1, 0,
+        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
+
+    /// Copy data back to GPU 0 and release vecs
+    multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+                                lwe_after_pbs_vec, h_lwe_indexes_out,
+                                num_radix_blocks, big_lwe_dimension + 1, false);
+  }
 
   /// Synchronize all GPUs
   auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
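In the univariate hunk above, the gpu_count == 1 branch is not just a specialization: it drops the two cuda_memcpy_async_to_cpu calls and the cuda_synchronize_stream that the multi-GPU path needs, because with a single GPU the lwe_indexes_in / lwe_indexes_out arrays can be consumed on the device as-is, so the stream never stalls for a host copy of the indexes. A minimal CUDA sketch of that idea follows; gather_by_index and all sizes are illustrative, not backend code.

#include <cstdio>
#include <cuda_runtime.h>

// Toy kernel: reorder fixed-size blocks through a device-resident index
// array, the way the single-GPU path consumes lut->lwe_indexes_in directly.
__global__ void gather_by_index(const double *in, const int *indexes,
                                double *out, int block_size) {
  int b = blockIdx.x;
  for (int i = threadIdx.x; i < block_size; i += blockDim.x)
    out[b * block_size + i] = in[indexes[b] * block_size + i];
}

int main() {
  const int num_blocks = 4, block_size = 8;
  const int h_indexes[num_blocks] = {3, 2, 1, 0};
  double h_in[num_blocks * block_size], h_out[num_blocks * block_size];
  for (int i = 0; i < num_blocks * block_size; i++)
    h_in[i] = i / block_size; // each block is filled with its own index

  double *d_in, *d_out;
  int *d_indexes;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(h_out));
  cudaMalloc(&d_indexes, sizeof(h_indexes));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  cudaMemcpy(d_indexes, h_indexes, sizeof(h_indexes), cudaMemcpyHostToDevice);

  // No host copy of the index array and no synchronization before the
  // kernel: the indexes are read on the device.
  gather_by_index<<<num_blocks, 32>>>(d_in, d_indexes, d_out, block_size);

  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  printf("block 0 now holds old block %.0f\n", h_out[0]);
  cudaFree(d_in);
  cudaFree(d_out);
  cudaFree(d_indexes);
  return 0;
}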
@@ -245,42 +262,59 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
   std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
   std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
   std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
+  if (gpu_count == 1) {
+    execute_keyswitch<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
+                             lwe_trivial_indexes_vec[0], lwe_array_pbs_in,
+                             lut->lwe_indexes_in, ksks, big_lwe_dimension,
+                             small_lwe_dimension, ks_base_log, ks_level,
+                             num_radix_blocks, false);
+
+    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+    /// dimension to a big LWE dimension
+    execute_pbs<Torus>(
+        streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
+        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
+        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
+        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
+        grouping_factor, num_radix_blocks, 1, 0,
+        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
+  } else {
+    auto h_lwe_indexes_in = lut->h_lwe_indexes_in;
+    auto h_lwe_indexes_out = lut->h_lwe_indexes_out;
+    cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lut->lwe_indexes_in,
+                             num_radix_blocks * sizeof(Torus), streams[0],
+                             gpu_indexes[0]);
+    cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lut->lwe_indexes_out,
+                             num_radix_blocks * sizeof(Torus), streams[0],
+                             gpu_indexes[0]);
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-  auto h_lwe_indexes_in = lut->h_lwe_indexes_in;
-  auto h_lwe_indexes_out = lut->h_lwe_indexes_out;
-  cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lut->lwe_indexes_in,
-                           num_radix_blocks * sizeof(Torus), streams[0],
-                           gpu_indexes[0]);
-  cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lut->lwe_indexes_out,
-                           num_radix_blocks * sizeof(Torus), streams[0],
-                           gpu_indexes[0]);
-  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    multi_gpu_lwe_scatter<Torus>(
+        streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
+        h_lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1, false);
-  multi_gpu_lwe_scatter<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
-      h_lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1, false);
-
-  /// Apply KS to go from a big LWE dimension to a small LWE dimension
-  execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
-                           lwe_trivial_indexes_vec, lwe_array_in_vec,
-                           lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
-                           small_lwe_dimension, ks_base_log, ks_level,
-                           num_radix_blocks, false);
-
-  /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
-  /// dimension to a big LWE dimension
-  execute_pbs<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
-      lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
-      lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
-      glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
-      pbs_level, grouping_factor, num_radix_blocks, 1, 0,
-      cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
-
-  /// Copy data back to GPU 0 and release vecs
-  multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
-                              lwe_after_pbs_vec, h_lwe_indexes_out,
-                              num_radix_blocks, big_lwe_dimension + 1, false);
+    /// Apply KS to go from a big LWE dimension to a small LWE dimension
+    execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
+                             lwe_trivial_indexes_vec, lwe_array_in_vec,
+                             lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
+                             small_lwe_dimension, ks_base_log, ks_level,
+                             num_radix_blocks, false);
+
+    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+    /// dimension to a big LWE dimension
+    execute_pbs<Torus>(
+        streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
+        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
+        lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
+        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
+        pbs_level, grouping_factor, num_radix_blocks, 1, 0,
+        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
+
+    /// Copy data back to GPU 0 and release vecs
+    multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+                                lwe_after_pbs_vec, h_lwe_indexes_out,
+                                num_radix_blocks, big_lwe_dimension + 1, false);
+  }
 
   /// Synchronize all GPUs
   auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
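The bivariate hunk mirrors the univariate one: the same gpu_count == 1 fast path, and the same else branch in which host copies of the index arrays drive multi_gpu_lwe_scatter / multi_gpu_lwe_gather, while the keyswitch and PBS in between run on densely packed per-GPU buffers with the trivial (identity) indexes. Below is a sketch of the near-even block partitioning such a scatter implies; the policy and helper names are assumptions for illustration, not the backend's implementation.

#include <algorithm>
#include <cstdio>

// Hypothetical partitioning: deal num_blocks out near-evenly, the first
// `rem` GPUs taking one extra block; a GPU is active iff it receives at
// least one block (compare get_active_gpu_count in the hunks above).
int blocks_on_gpu(int num_blocks, int gpu, int gpu_count) {
  int base = num_blocks / gpu_count;
  int rem = num_blocks % gpu_count;
  return base + (gpu < rem ? 1 : 0);
}

int active_gpu_count(int num_blocks, int gpu_count) {
  return std::min(num_blocks, gpu_count);
}

int main() {
  const int num_blocks = 10, gpu_count = 4;
  for (int g = 0; g < gpu_count; g++)
    printf("gpu %d gets %d blocks\n", g,
           blocks_on_gpu(num_blocks, g, gpu_count));
  printf("active GPUs: %d\n", active_gpu_count(num_blocks, gpu_count));
  return 0;
}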
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
index f2cdc18ddb..fe6c6a007e 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -343,55 +343,80 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
   std::vector<Torus *> lwe_trivial_indexes_vec =
       luts_message_carry->lwe_trivial_indexes_vec;
 
-  auto h_lwe_indexes_in = luts_message_carry->h_lwe_indexes_in;
-  auto h_lwe_indexes_out = luts_message_carry->h_lwe_indexes_out;
-  cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lwe_indexes_in,
-                           total_count * sizeof(Torus), streams[0],
-                           gpu_indexes[0]);
-  cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lwe_indexes_out,
-                           total_count * sizeof(Torus), streams[0],
-                           gpu_indexes[0]);
-  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
-  multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
-                               new_blocks_vec, new_blocks, h_lwe_indexes_in,
-                               message_count, big_lwe_size, false);
-
-  /// Apply KS to go from a big LWE dimension to a small LWE dimension
-  /// After this keyswitch execution, we need to synchronize the streams
-  /// because the keyswitch and PBS do not operate on the same number of
-  /// inputs
-  execute_keyswitch<Torus>(
-      streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
-      lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec, ksks,
-      big_lwe_dimension, lwe_dimension, mem_ptr->params.ks_base_log,
-      mem_ptr->params.ks_level, message_count, false);
-
-  /// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
-  /// different configuration
-  multi_gpu_lwe_gather<Torus>(
-      streams, gpu_indexes, gpu_count, small_lwe_vector, small_lwe_vector_vec,
-      h_lwe_indexes_in, message_count, small_lwe_size);
-
-  multi_gpu_lwe_scatter<Torus>(
-      streams, gpu_indexes, gpu_count, small_lwe_vector_vec, small_lwe_vector,
-      h_lwe_indexes_in, total_count, small_lwe_size, false);
-
-  /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
-  /// dimension to a big LWE dimension
-  execute_pbs<Torus>(streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
-                     lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
-                     luts_message_carry->lut_indexes_vec,
-                     small_lwe_vector_vec, lwe_trivial_indexes_vec, bsks,
-                     luts_message_carry->buffer, glwe_dimension,
-                     lwe_dimension, polynomial_size,
-                     mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
-                     mem_ptr->params.grouping_factor, total_count, 2, 0,
-                     max_shared_memory, mem_ptr->params.pbs_type, false);
-
-  multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
-                              lwe_after_pbs_vec, h_lwe_indexes_out,
-                              total_count, big_lwe_size);
+  if (gpu_count == 1) {
+    /// Apply KS to go from a big LWE dimension to a small LWE dimension
+    /// After this keyswitch execution, we need to synchronize the streams
+    /// because the keyswitch and PBS do not operate on the same number of
+    /// inputs
+    execute_keyswitch<Torus>(
+        streams, gpu_indexes, gpu_count, small_lwe_vector, lwe_indexes_in,
+        new_blocks, lwe_indexes_in, ksks, polynomial_size * glwe_dimension,
+        lwe_dimension, mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
+        message_count, true);
+
+    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+    /// dimension to a big LWE dimension
+    execute_pbs<Torus>(
+        streams, gpu_indexes, gpu_count, new_blocks, lwe_indexes_out,
+        luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
+        small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer,
+        glwe_dimension, lwe_dimension, polynomial_size,
+        mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
+        mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
+        mem_ptr->params.pbs_type, true);
+  } else {
+    auto h_lwe_indexes_in = luts_message_carry->h_lwe_indexes_in;
+    auto h_lwe_indexes_out = luts_message_carry->h_lwe_indexes_out;
+    cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lwe_indexes_in,
+                             total_count * sizeof(Torus), streams[0],
+                             gpu_indexes[0]);
+    cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lwe_indexes_out,
+                             total_count * sizeof(Torus), streams[0],
+                             gpu_indexes[0]);
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
+    multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
+                                 new_blocks_vec, new_blocks, h_lwe_indexes_in,
+                                 message_count, big_lwe_size, false);
+
+    /// Apply KS to go from a big LWE dimension to a small LWE dimension
+    /// After this keyswitch execution, we need to synchronize the streams
+    /// because the keyswitch and PBS do not operate on the same number of
+    /// inputs
+    execute_keyswitch<Torus>(
+        streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
+        lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
+        ksks, big_lwe_dimension, lwe_dimension, mem_ptr->params.ks_base_log,
+        mem_ptr->params.ks_level, message_count, false);
+
+    /// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
+    /// different configuration
+    multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count,
+                                small_lwe_vector, small_lwe_vector_vec,
+                                h_lwe_indexes_in, message_count,
+                                small_lwe_size);
+
+    multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
+                                 small_lwe_vector_vec, small_lwe_vector,
+                                 h_lwe_indexes_in, total_count,
+                                 small_lwe_size, false);
+
+    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+    /// dimension to a big LWE dimension
+    execute_pbs<Torus>(
+        streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
+        lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
+        luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
+        lwe_trivial_indexes_vec, bsks, luts_message_carry->buffer,
+        glwe_dimension, lwe_dimension, polynomial_size,
+        mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
+        mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
+        mem_ptr->params.pbs_type, false);
+
+    multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
+                                lwe_after_pbs_vec, h_lwe_indexes_out,
+                                total_count, big_lwe_size);
+  }
 
   luts_message_carry->release(streams, gpu_indexes, gpu_count);
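In host_integer_sum_ciphertexts_vec_kb the comment kept in both branches matters: the keyswitch runs on message_count inputs, while the PBS runs on total_count inputs with two LUTs (the 2 passed to execute_pbs), producing a message block and a carry block per partial sum, which is why the two stages cannot share a launch configuration and need a synchronization point between them. A toy illustration of the counts follows; the numbers are made up, not taken from the code.

#include <cstdio>

// Illustrative counts only: in the sum-of-ciphertexts step the keyswitch
// consumes the message blocks, while the PBS also extracts the carries,
// so it runs over strictly more inputs.
int main() {
  int message_count = 8; // blocks keyswitched after the columnar sum
  int carry_count = 7;   // extra blocks whose carries must be extracted
  int total_count = message_count + carry_count;
  printf("keyswitch inputs: %d\n", message_count);
  printf("pbs inputs:       %d (message + carry)\n", total_count);
  return 0;
}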