From 2674d177c98301a2aeedf94d211c571e0144e909 Mon Sep 17 00:00:00 2001
From: Agnes Leroy <agnes.leroy@zama.ai>
Date: Fri, 26 Jul 2024 10:51:22 +0200
Subject: [PATCH] fix(gpu): fix scalar rotate and add some checks

---
 backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh    | 3 ++-
 .../tfhe-cuda-backend/cuda/src/integer/multiplication.cuh  | 7 ++-----
 .../cuda/src/integer/scalar_comparison.cuh                 | 2 ++
 .../tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh   | 6 ++++--
 .../cuda/src/integer/shift_and_rotate.cuh                  | 2 --
 5 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index b0e8d4e7de..023b5bfadc 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -762,8 +762,9 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
                           Torus *lwe_array_out, Torus *lwe_array_in,
                           uint32_t lwe_dimension, uint32_t num_radix_blocks,
                           uint32_t factor) {
+  if (num_radix_blocks == 0)
+    return;
   cudaSetDevice(gpu_index);
-
   int num_blocks = 0, num_threads = 0;
   int num_entries = (lwe_dimension + 1);
   getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads);
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
index 96a410408d..fdfab13cdc 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -138,7 +138,6 @@ template <typename Torus, class params>
 __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
                                         Torus *msb_blocks,
                                         uint32_t glwe_dimension,
-                                        uint32_t lsb_count, uint32_t msb_count,
                                         uint32_t num_blocks) {
   size_t big_lwe_dimension = glwe_dimension * params::degree + 1;
   size_t big_lwe_id = blockIdx.x;
@@ -321,8 +320,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
     luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
                                         h_lwe_idx_in, h_lwe_idx_out);
 
-    size_t copy_size = total_count * sizeof(Torus);
-    copy_size = sm_copy_count * sizeof(int32_t);
+    size_t copy_size = sm_copy_count * sizeof(int32_t);
     cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
                              streams[0], gpu_indexes[0]);
     cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
@@ -548,8 +546,7 @@ __host__ void host_integer_mult_radix_kb(
   fill_radix_from_lsb_msb<Torus, params>
       <<<num_blocks * num_blocks, params::degree / params::opt, 0,
          streams[0]>>>(vector_result_sb, vector_result_lsb, vector_result_msb,
-                       glwe_dimension, lsb_vector_block_count,
-                       msb_vector_block_count, num_blocks);
+                       glwe_dimension, num_blocks);
   check_cuda_error(cudaGetLastError());
 
   int terms_degree[2 * num_blocks * num_blocks];
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
index a8b6882394..6d065eb79e 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -565,6 +565,8 @@ __host__ void scalar_compare_radix_blocks_kb(
     int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
     uint32_t num_radix_blocks) {
 
+  if (num_radix_blocks == 0)
+    return;
   auto params = mem_ptr->params;
   auto big_lwe_dimension = params.big_lwe_dimension;
   auto message_modulus = params.message_modulus;
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
index 2666951a3c..e951d2b2a6 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
@@ -49,8 +49,6 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
 
   Torus *rotated_buffer = mem->tmp_rotated;
 
-  auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
-
   // rotate right all the blocks in radix ciphertext
   // copy result in new buffer
   // 256 threads are used in every block
@@ -76,6 +74,8 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
                                    giver_blocks, lwe_array, 1, num_blocks,
                                    big_lwe_size);
 
+    auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
+
     integer_radix_apply_bivariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
         giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
@@ -100,6 +100,8 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
     host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, giver_blocks,
                                   lwe_array, 1, num_blocks, big_lwe_size);
 
+    auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
+
     integer_radix_apply_bivariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
         giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
index 2106cd6295..e595781256 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
@@ -37,8 +37,6 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
   auto big_lwe_size = big_lwe_dimension + 1;
   auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
 
-  cudaSetDevice(gpu_indexes[0]);
-
   // Extract all bits
   auto bits = mem->tmp_bits;
   extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, bits, lwe_array, bsks,