From 46c9f067b11fb211b9bc91ad6573e7e218dc7103 Mon Sep 17 00:00:00 2001
From: Agnes Leroy <agnes.leroy@zama.ai>
Date: Thu, 18 Jul 2024 17:38:40 +0200
Subject: [PATCH] fix(gpu): use active_gpu_count for sub-stream loops, minor cleanups

---
 .../tfhe-cuda-backend/cuda/include/integer.h   | 18 +++++++++++-------
 .../cuda/src/integer/addition.cuh              |  2 +-
 .../cuda/src/integer/cmux.cuh                  |  4 +++-
 .../cuda/src/integer/comparison.cuh            |  1 -
 .../cuda/src/integer/div_rem.cuh               |  8 ++++----
 .../cuda/src/integer/scalar_comparison.cuh     |  9 ++++-----
 .../cuda/src/integer/scalar_shifts.cuh         |  2 +-
 .../cuda/src/utils/helper_multi_gpu.cu         |  1 -
 8 files changed, 24 insertions(+), 21 deletions(-)
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
index 3fa42e8382..df4eb39df0 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -1736,7 +1736,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
                                      int_radix_params params,
                                      uint32_t num_radix_blocks,
                                      bool allocate_gpu_memory) {
-    active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+    active_gpu_count = get_active_gpu_count(1, gpu_count);
     // In the arithmetic shift, a PBS has to be applied to the last rotated
     // block twice: once to shift it, once to compute the padding block to be
     // copied onto all blocks to the left of the last rotated block
@@ -3074,12 +3074,12 @@ template <typename Torus> struct int_resolve_signed_overflow_memory {
 
 template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {
   int_radix_params params;
+  uint32_t active_gpu_count;
 
   // memory objects for other operations
   int_sc_prop_memory<Torus> *scp_mem;
   int_last_block_inner_propagate_memory<Torus> *las_block_prop_mem;
   int_resolve_signed_overflow_memory<Torus> *resolve_overflow_mem;
-  // lookupt tables
 
   // sub streams
   cudaStream_t *sub_streams_1;
@@ -3118,13 +3118,17 @@ template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {
       int_radix_params params, uint32_t num_blocks, SIGNED_OPERATION op,
       bool allocate_gpu_memory) {
     this->params = params;
+    active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
 
-    allocate_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks);
+    allocate_temporary_buffers(streams, gpu_indexes, active_gpu_count,
+                               num_blocks);
 
     // initialize streams
-    sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
-    sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
-    for (uint j = 0; j < gpu_count; j++) {
+    sub_streams_1 =
+        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+    sub_streams_2 =
+        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+    for (uint j = 0; j < active_gpu_count; j++) {
       sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
       sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
     }
@@ -3160,7 +3164,7 @@ template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {
     cuda_drop_async(last_block_inner_propagation, streams[0], gpu_indexes[0]);
 
     // sub streams
-    for (uint i = 0; i < gpu_count; i++) {
+    for (uint i = 0; i < active_gpu_count; i++) {
       cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
       cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
     }
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
index bab675bf51..28b02a57f8 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
@@ -131,7 +131,7 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
     }
   }
 
-  for (uint j = 0; j < gpu_count; j++) {
+  for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
     cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
     cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
   }
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
index 43fe3b1846..a1057a443b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -76,8 +76,10 @@ __host__ void host_integer_radix_cmux_kb(
                   mem_ptr->predicate_lut, bsks, ksks, num_radix_blocks);
     }
   }
-  for (uint j = 0; j < gpu_count; j++) {
+  for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
     cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
+  }
+  for (uint j = 0; j < mem_ptr->zero_if_false_buffer->active_gpu_count; j++) {
     cuda_synchronize_stream(false_streams[j], gpu_indexes[j]);
   }
 
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
index e1d3e8452f..d56d6e3297 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -245,7 +245,6 @@ __host__ void host_compare_with_zero_equality(
     int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
     int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {
 
-  cudaSetDevice(gpu_indexes[0]);
   auto params = mem_ptr->params;
   auto big_lwe_dimension = params.big_lwe_dimension;
   auto message_modulus = params.message_modulus;
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
index b2ee88c052..f38c591678 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -400,7 +400,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                                           gpu_count);
       }
     }
-    for (uint j = 0; j < gpu_count; j++) {
+    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
       cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
       cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
       cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
@@ -510,7 +510,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                                                  gpu_indexes, gpu_count);
       }
     }
-    for (uint j = 0; j < gpu_count; j++) {
+    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
       cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
       cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
       cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
@@ -587,7 +587,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
         set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
       }
     }
-    for (uint j = 0; j < gpu_count; j++) {
+    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
       cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
       cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
       cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
@@ -628,7 +628,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
           bsks, ksks, num_blocks, mem_ptr->message_extract_lut_2);
     }
   }
-  for (uint j = 0; j < gpu_count; j++) {
+  for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
     cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
     cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
   }
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
index 3aba53cce4..a8b6882394 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -133,7 +133,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
             mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
       }
     }
-    for (uint j = 0; j < gpu_count; j++) {
+    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
       cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
       cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
     }
@@ -205,7 +205,6 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
     std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
     uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
 
-  cudaSetDevice(gpu_indexes[0]);
   auto params = mem_ptr->params;
   auto big_lwe_dimension = params.big_lwe_dimension;
   auto glwe_dimension = params.glwe_dimension;
@@ -397,7 +396,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
             signed_msb_lut->params.message_modulus);
       }
     }
-    for (uint j = 0; j < gpu_count; j++) {
+    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
       cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
       cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
     }
@@ -465,7 +464,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
             mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
       }
     }
-    for (uint j = 0; j < gpu_count; j++) {
+    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
       cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
       cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
     }
@@ -737,7 +736,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
     }
   }
 
-  for (uint j = 0; j < gpu_count; j++) {
+  for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
     cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
     cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
   }
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
index cfc9a6773c..e612c9ab2f 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -245,7 +245,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
         }
       }
     }
-    for (uint j = 0; j < gpu_count; j++) {
+    for (uint j = 0; j < mem->active_gpu_count; j++) {
       cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
       cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]);
     }
diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu
index cc769cfcc2..a6d6cdd540 100644
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu
@@ -27,7 +27,6 @@ int cuda_setup_multi_gpu() {
         num_used_gpus += 1;
       }
     } else {
-      int has_peer_access_to_device_0;
       for (int i = 1; i < num_gpus; i++)
         num_used_gpus += 1;
     }