diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index b0e8d4e7de..023b5bfadc 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -762,8 +762,9 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lwe_array_in, uint32_t lwe_dimension, uint32_t num_radix_blocks, uint32_t factor) { + if (num_radix_blocks == 0) + return; cudaSetDevice(gpu_index); - int num_blocks = 0, num_threads = 0; int num_entries = (lwe_dimension + 1); getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 96a410408d..fdfab13cdc 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -138,7 +138,6 @@ template __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks, Torus *msb_blocks, uint32_t glwe_dimension, - uint32_t lsb_count, uint32_t msb_count, uint32_t num_blocks) { size_t big_lwe_dimension = glwe_dimension * params::degree + 1; size_t big_lwe_id = blockIdx.x; @@ -321,8 +320,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb( luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0], h_lwe_idx_in, h_lwe_idx_out); - size_t copy_size = total_count * sizeof(Torus); - copy_size = sm_copy_count * sizeof(int32_t); + size_t copy_size = sm_copy_count * sizeof(int32_t); cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size, streams[0], gpu_indexes[0]); cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size, @@ -548,8 +546,7 @@ __host__ void host_integer_mult_radix_kb( fill_radix_from_lsb_msb <<>>(vector_result_sb, vector_result_lsb, vector_result_msb, - glwe_dimension, lsb_vector_block_count, - msb_vector_block_count, num_blocks); + glwe_dimension, num_blocks); check_cuda_error(cudaGetLastError()); int terms_degree[2 * num_blocks * num_blocks]; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh index a8b6882394..6d065eb79e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh @@ -565,6 +565,8 @@ __host__ void scalar_compare_radix_blocks_kb( int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, uint32_t num_radix_blocks) { + if (num_radix_blocks == 0) + return; auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; auto message_modulus = params.message_modulus; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh index 2666951a3c..e951d2b2a6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh @@ -49,8 +49,6 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace( Torus *rotated_buffer = mem->tmp_rotated; - auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1]; - // rotate right all the blocks in radix ciphertext // copy result in new buffer // 256 threads are used in every block @@ -76,6 +74,8 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace( giver_blocks, lwe_array, 1, num_blocks, big_lwe_size); + auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1]; + integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks, giver_blocks, bsks, ksks, num_blocks, lut_bivariate, @@ -100,6 +100,8 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace( host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, giver_blocks, lwe_array, 1, num_blocks, big_lwe_size); + auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1]; + integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks, giver_blocks, bsks, ksks, num_blocks, lut_bivariate, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh index 2106cd6295..e595781256 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh @@ -37,8 +37,6 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( auto big_lwe_size = big_lwe_dimension + 1; auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); - cudaSetDevice(gpu_indexes[0]); - // Extract all bits auto bits = mem->tmp_bits; extract_n_bits(streams, gpu_indexes, gpu_count, bits, lwe_array, bsks,