diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
index c03120a069..0bd4030465 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -583,15 +583,15 @@ template <typename Torus> struct int_radix_lut {
     /// With multiple GPUs we allocate arrays to be pushed to the vectors and
     /// copy data on each GPU then when we gather data to GPU 0 we can copy
     /// back to the original indexing
-    multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
-                        num_radix_blocks, params.big_lwe_dimension + 1,
-                        false);
-    multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
-                        num_radix_blocks, params.small_lwe_dimension + 1,
-                        false);
-    multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
-                        num_radix_blocks, params.big_lwe_dimension + 1,
-                        false);
+    multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count,
+                        lwe_array_in_vec, num_radix_blocks,
+                        params.big_lwe_dimension + 1, false);
+    multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count,
+                        lwe_after_ks_vec, num_radix_blocks,
+                        params.small_lwe_dimension + 1, false);
+    multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count,
+                        lwe_after_pbs_vec, num_radix_blocks,
+                        params.big_lwe_dimension + 1, false);
     multi_gpu_alloc_array(streams, gpu_indexes, active_gpu_count,
                           lwe_trivial_indexes_vec, num_radix_blocks, false);
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
@@ -778,10 +778,10 @@ template <typename Torus> struct int_radix_lut {
     }
     buffer.clear();

-    multi_gpu_release_lwe(streams, gpu_indexes, lwe_array_in_vec, false);
-    multi_gpu_release_lwe(streams, gpu_indexes, lwe_after_ks_vec, false);
-    multi_gpu_release_lwe(streams, gpu_indexes, lwe_after_pbs_vec, false);
-    multi_gpu_release_lwe(streams, gpu_indexes, lwe_trivial_indexes_vec);
+    multi_gpu_release(streams, gpu_indexes, lwe_array_in_vec, false);
+    multi_gpu_release(streams, gpu_indexes, lwe_after_ks_vec, false);
+    multi_gpu_release(streams, gpu_indexes, lwe_after_pbs_vec, false);
+    multi_gpu_release(streams, gpu_indexes, lwe_trivial_indexes_vec);
     lwe_array_in_vec.clear();
     lwe_after_ks_vec.clear();
     lwe_after_pbs_vec.clear();
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
index 9850fbadda..a8816ff290 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -131,13 +131,10 @@ void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes,
                        const LweArrayVariant<Torus> &lwe_input_indexes,
                        Torus **ksks, uint32_t lwe_dimension_in,
                        uint32_t lwe_dimension_out, uint32_t base_log,
-                       uint32_t level_count, uint32_t num_samples,
-                       bool sync_streams = true) {
+                       uint32_t level_count, uint32_t num_samples) {

   /// If the number of radix blocks is lower than the number of GPUs, not all
   /// GPUs will be active and there will be 1 input per GPU
-  if (sync_streams)
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);

   for (uint i = 0; i < gpu_count; i++) {
     int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
@@ -155,11 +152,6 @@ void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes,
                           current_lwe_input_indexes, ksks[i], lwe_dimension_in,
                           lwe_dimension_out, base_log, level_count,
                           num_samples_on_gpu);
   }
-
-  if (sync_streams)
-    for (uint i = 0; i < gpu_count; i++) {
-      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-    }
 }
 #endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index ec8a0a27d8..70fe720654 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -168,17 +168,17 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
         lwe_trivial_indexes_vec[0], lwe_array_in, lut->lwe_indexes_in, ksks,
         big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level,
-        num_radix_blocks, false);
+        num_radix_blocks);

     /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
     /// dimension to a big LWE dimension
-    execute_pbs(
-        streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
-        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
-        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
-        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, num_radix_blocks, 1, 0,
-        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
+    execute_pbs(streams, gpu_indexes, 1, lwe_array_out,
+                lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
+                lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
+                lut->buffer, glwe_dimension, small_lwe_dimension,
+                polynomial_size, pbs_base_log, pbs_level,
+                grouping_factor, num_radix_blocks, 1, 0,
+                cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
   } else {
     /// Make sure all data that should be on GPU 0 is indeed there
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
@@ -188,14 +188,14 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
     multi_gpu_scatter_lwe(
         streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_in,
         lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
-        big_lwe_dimension + 1, false);
+        big_lwe_dimension + 1);

     /// Apply KS to go from a big LWE dimension to a small LWE dimension
-    execute_keyswitch(streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
-                      lwe_trivial_indexes_vec, lwe_array_in_vec,
-                      lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
-                      small_lwe_dimension, ks_base_log, ks_level,
-                      num_radix_blocks, false);
+    execute_keyswitch(streams, gpu_indexes, active_gpu_count,
+                      lwe_after_ks_vec, lwe_trivial_indexes_vec,
+                      lwe_array_in_vec, lwe_trivial_indexes_vec, ksks,
+                      big_lwe_dimension, small_lwe_dimension,
+                      ks_base_log, ks_level, num_radix_blocks);

     /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
     /// dimension to a big LWE dimension
@@ -205,13 +205,14 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
         lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
         glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
         pbs_level, grouping_factor, num_radix_blocks, 1, 0,
-        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
+        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);

     /// Copy data back to GPU 0 and release vecs
-    multi_gpu_gather_lwe(streams, gpu_indexes, active_gpu_count, lwe_array_out,
-                         lwe_after_pbs_vec, lut->h_lwe_indexes_out,
+    multi_gpu_gather_lwe(streams, gpu_indexes, active_gpu_count,
+                         lwe_array_out, lwe_after_pbs_vec,
+                         lut->h_lwe_indexes_out,
                          lut->using_trivial_lwe_indexes,
-                         num_radix_blocks, big_lwe_dimension + 1, false);
+                         num_radix_blocks, big_lwe_dimension + 1);

     /// Synchronize all GPUs
     for (uint i = 0; i < active_gpu_count; i++) {
@@ -260,30 +261,30 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
         lwe_trivial_indexes_vec[0], lwe_array_pbs_in, lut->lwe_indexes_in,
         ksks, big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level,
-        num_radix_blocks, false);
+        num_radix_blocks);

     /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
     /// dimension to a big LWE dimension
-    execute_pbs(
-        streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
-        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
-        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
-        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, num_radix_blocks, 1, 0,
-        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
+    execute_pbs(streams, gpu_indexes, 1, lwe_array_out,
+                lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
+                lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
+                lut->buffer, glwe_dimension, small_lwe_dimension,
+                polynomial_size, pbs_base_log, pbs_level,
+                grouping_factor, num_radix_blocks, 1, 0,
+                cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
   } else {
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);

     multi_gpu_scatter_lwe(
-        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
-        lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
-        big_lwe_dimension + 1, false);
+        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
+        lwe_array_pbs_in, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
+        num_radix_blocks, big_lwe_dimension + 1);

     /// Apply KS to go from a big LWE dimension to a small LWE dimension
-    execute_keyswitch(streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
-                      lwe_trivial_indexes_vec, lwe_array_in_vec,
-                      lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
-                      small_lwe_dimension, ks_base_log, ks_level,
-                      num_radix_blocks, false);
+    execute_keyswitch(streams, gpu_indexes, active_gpu_count,
+                      lwe_after_ks_vec, lwe_trivial_indexes_vec,
+                      lwe_array_in_vec, lwe_trivial_indexes_vec, ksks,
+                      big_lwe_dimension, small_lwe_dimension,
+                      ks_base_log, ks_level, num_radix_blocks);

     /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
     /// dimension to a big LWE dimension
@@ -293,13 +294,14 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
         lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
         glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
         pbs_level, grouping_factor, num_radix_blocks, 1, 0,
-        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
+        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);

     /// Copy data back to GPU 0 and release vecs
-    multi_gpu_gather_lwe(streams, gpu_indexes, active_gpu_count, lwe_array_out,
-                         lwe_after_pbs_vec, lut->h_lwe_indexes_out,
+    multi_gpu_gather_lwe(streams, gpu_indexes, active_gpu_count,
+                         lwe_array_out, lwe_after_pbs_vec,
+                         lut->h_lwe_indexes_out,
                          lut->using_trivial_lwe_indexes,
-                         num_radix_blocks, big_lwe_dimension + 1, false);
+                         num_radix_blocks, big_lwe_dimension + 1);

     /// Synchronize all GPUs
     for (uint i = 0; i < active_gpu_count; i++) {
@@ -674,8 +676,7 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
         streams, gpu_indexes, 1, mem_ptr->tmp_small_lwe_vector,
         mem_ptr->lut->lwe_trivial_indexes, cur_input_block,
         mem_ptr->lut->lwe_trivial_indexes, ksks, params.big_lwe_dimension,
-        params.small_lwe_dimension, params.ks_base_log, params.ks_level, 1,
-        false);
+        params.small_lwe_dimension, params.ks_base_log, params.ks_level, 1);

     cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size],
                                  mem_ptr->tmp_small_lwe_vector,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
index 7e26a76781..3608b95fae 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -359,7 +359,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
           lwe_indexes_in, new_blocks, lwe_indexes_in, ksks,
           polynomial_size * glwe_dimension, small_lwe_dimension,
           mem_ptr->params.ks_base_log,
-          mem_ptr->params.ks_level, message_count, false);
+          mem_ptr->params.ks_level, message_count);

       /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
       /// dimension to a big LWE dimension
@@ -370,7 +370,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
           glwe_dimension, small_lwe_dimension, polynomial_size,
           mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
           mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
-          mem_ptr->params.pbs_type, false);
+          mem_ptr->params.pbs_type);
     } else {
       cuda_synchronize_stream(streams[0], gpu_indexes[0]);

@@ -378,18 +378,17 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
           streams, gpu_indexes, active_gpu_count, new_blocks_vec, new_blocks,
           luts_message_carry->h_lwe_indexes_in,
           luts_message_carry->using_trivial_lwe_indexes, total_count,
-          big_lwe_size, false);
+          big_lwe_size);

       /// Apply KS to go from a big LWE dimension to a small LWE dimension
       /// After this keyswitch execution, we need to synchronize the streams
       /// because the keyswitch and PBS do not operate on the same number of
       /// inputs
-      execute_keyswitch(streams, gpu_indexes, active_gpu_count,
-                        small_lwe_vector_vec, lwe_trivial_indexes_vec,
-                        new_blocks_vec, lwe_trivial_indexes_vec, ksks,
-                        big_lwe_dimension, small_lwe_dimension,
-                        mem_ptr->params.ks_base_log,
-                        mem_ptr->params.ks_level, total_count, false);
+      execute_keyswitch(
+          streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
+          lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
+          ksks, big_lwe_dimension, small_lwe_dimension,
+          mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);

       /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
       /// dimension to a big LWE dimension
@@ -401,13 +400,17 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
           glwe_dimension, small_lwe_dimension, polynomial_size,
           mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
           mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
-          mem_ptr->params.pbs_type, false);
+          mem_ptr->params.pbs_type);

-      multi_gpu_gather_lwe(streams, gpu_indexes, active_gpu_count, new_blocks,
-                           lwe_after_pbs_vec,
+      multi_gpu_gather_lwe(streams, gpu_indexes, active_gpu_count,
+                           new_blocks, lwe_after_pbs_vec,
                            luts_message_carry->h_lwe_indexes_out,
                            luts_message_carry->using_trivial_lwe_indexes,
                            total_count, big_lwe_size);
+      /// Synchronize all GPUs
+      for (uint i = 1; i < active_gpu_count; i++) {
+        cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+      }
     }

     int rem_blocks = (r > chunk_size) ? r % chunk_size * num_blocks : 0;
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
index 0543e22956..1689019aaa 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -128,9 +128,7 @@ void execute_pbs(
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t level_count, uint32_t grouping_factor,
     uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory, PBS_TYPE pbs_type, bool sync_streams = true) {
-  if (sync_streams)
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    uint32_t max_shared_memory, PBS_TYPE pbs_type) {
   switch (sizeof(Torus)) {
   case sizeof(uint32_t):
     // 32 bits
@@ -242,11 +240,6 @@ void execute_pbs(
     PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer "
           "moduli are supported.")
   }
-
-  if (sync_streams)
-    for (uint i = 0; i < gpu_count; i++) {
-      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-    }
 }

 template <typename Torus>
diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
index ee6a92322f..01d4856d59 100644
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
@@ -24,21 +24,21 @@ void multi_gpu_alloc_array(cudaStream_t *streams, uint32_t *gpu_indexes,
 /// Copy an array residing on one GPU to all active gpus
 template <typename Torus>
 void multi_gpu_copy_array(cudaStream_t *streams, uint32_t *gpu_indexes,
-                          uint32_t active_gpu_count, std::vector<Torus *> &dest,
+                          uint32_t gpu_count, std::vector<Torus *> &dest,
                           Torus *src, uint32_t elements_per_gpu,
                           bool sync_threads = true) {
   if (sync_threads)
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);

-  dest.resize(active_gpu_count);
-  for (uint i = 0; i < active_gpu_count; i++) {
+  dest.resize(gpu_count);
+  for (uint i = 0; i < gpu_count; i++) {
     cuda_memcpy_async_gpu_to_gpu(dest[i], src, elements_per_gpu * sizeof(Torus),
                                  streams[i], gpu_indexes[i]);
   }

   if (sync_threads)
-    for (uint i = 0; i < active_gpu_count; i++)
+    for (uint i = 0; i < gpu_count; i++)
       cuda_synchronize_stream(streams[i], gpu_indexes[i]);
 }
 /// Allocates the input/output vector for all devices
@@ -47,21 +47,18 @@ void multi_gpu_copy_array(cudaStream_t *streams, uint32_t *gpu_indexes,
 template <typename Torus>
 void multi_gpu_alloc_lwe(cudaStream_t *streams, uint32_t *gpu_indexes,
                          uint32_t gpu_count, std::vector<Torus *> &dest,
-                         uint32_t num_inputs, uint32_t elements_per_input,
+                         uint32_t num_inputs, uint32_t lwe_size,
                          bool sync_threads = true) {
-  auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
-
-  dest.resize(active_gpu_count);
-  for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
+  dest.resize(gpu_count);
+  for (uint i = 0; i < gpu_count; i++) {
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
     Torus *d_array = (Torus *)cuda_malloc_async(
-        inputs_on_gpu * elements_per_input * sizeof(Torus), streams[i],
-        gpu_indexes[i]);
+        inputs_on_gpu * lwe_size * sizeof(Torus), streams[i], gpu_indexes[i]);
     dest[i] = d_array;
   }

   if (sync_threads)
-    for (uint i = 0; i < active_gpu_count; i++)
+    for (uint i = 0; i < gpu_count; i++)
       cuda_synchronize_stream(streams[i], gpu_indexes[i]);
 }
 /// Load an array residing on one GPU to all active gpus
@@ -73,48 +70,36 @@ void multi_gpu_scatter_lwe(cudaStream_t *streams, uint32_t *gpu_indexes,
                            uint32_t gpu_count, std::vector<Torus *> &dest,
                            Torus *src, Torus *h_src_indexes,
                            bool is_trivial_index, uint32_t num_inputs,
-                           uint32_t elements_per_input,
-                           bool sync_threads = true) {
+                           uint32_t lwe_size) {

-  auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
-
-  if (sync_threads)
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
-  dest.resize(active_gpu_count);
-  for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+  dest.resize(gpu_count);
+  for (uint i = 0; i < gpu_count; i++) {
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
     auto gpu_offset = 0;
     for (uint j = 0; j < i; j++) {
-      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
+      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
     }

     if (is_trivial_index) {
       auto d_dest = dest[i];
-      auto d_src = src + gpu_offset * elements_per_input;
-      cuda_memcpy_async_gpu_to_gpu(
-          d_dest, d_src, inputs_on_gpu * elements_per_input * sizeof(Torus),
-          streams[i], gpu_indexes[i]);
+      auto d_src = src + gpu_offset * lwe_size;
+      cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
+                                   inputs_on_gpu * lwe_size * sizeof(Torus),
+                                   streams[i], gpu_indexes[i]);
     } else {
       auto src_indexes = h_src_indexes + gpu_offset;
-      // TODO Check if we can increase parallelization by adding another omp
-      // clause here
       for (uint j = 0; j < inputs_on_gpu; j++) {
-        auto d_dest = dest[i] + j * elements_per_input;
-        auto d_src = src + src_indexes[j] * elements_per_input;
+        auto d_dest = dest[i] + j * lwe_size;
+        auto d_src = src + src_indexes[j] * lwe_size;

-        cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
-                                     elements_per_input * sizeof(Torus),
+        cuda_memcpy_async_gpu_to_gpu(d_dest, d_src, lwe_size * sizeof(Torus),
                                      streams[i], gpu_indexes[i]);
       }
     }
   }
-
-  if (sync_threads)
-    for (uint i = 0; i < active_gpu_count; i++)
-      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
 }

 /// Copy data from multiple GPUs back to GPU 0 following the indexing given in
@@ -125,56 +110,43 @@ void multi_gpu_gather_lwe(cudaStream_t *streams, uint32_t *gpu_indexes,
                           uint32_t gpu_count, Torus *dest,
                           const std::vector<Torus *> &src,
                           Torus *h_dest_indexes, bool is_trivial_index,
-                          uint32_t num_inputs, uint32_t elements_per_input,
-                          bool sync_threads = true) {
-
-  auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
+                          uint32_t num_inputs, uint32_t lwe_size) {

-  if (sync_threads)
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
-  for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
+  for (uint i = 0; i < gpu_count; i++) {
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
     auto gpu_offset = 0;
     for (uint j = 0; j < i; j++) {
-      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
+      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
     }

     if (is_trivial_index) {
-      auto d_dest = dest + gpu_offset * elements_per_input;
+      auto d_dest = dest + gpu_offset * lwe_size;
       auto d_src = src[i];
-      cuda_memcpy_async_gpu_to_gpu(
-          d_dest, d_src, inputs_on_gpu * elements_per_input * sizeof(Torus),
-          streams[i], gpu_indexes[i]);
+      cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
+                                   inputs_on_gpu * lwe_size * sizeof(Torus),
+                                   streams[i], gpu_indexes[i]);
     } else {
       auto dest_indexes = h_dest_indexes + gpu_offset;
-      // TODO Check if we can increase parallelization by adding another omp
-      // clause here
       for (uint j = 0; j < inputs_on_gpu; j++) {
-        auto d_dest = dest + dest_indexes[j] * elements_per_input;
-        auto d_src = src[i] + j * elements_per_input;
+        auto d_dest = dest + dest_indexes[j] * lwe_size;
+        auto d_src = src[i] + j * lwe_size;

-        cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
-                                     elements_per_input * sizeof(Torus),
+        cuda_memcpy_async_gpu_to_gpu(d_dest, d_src, lwe_size * sizeof(Torus),
                                      streams[i], gpu_indexes[i]);
       }
     }
   }
-
-  if (sync_threads)
-    for (uint i = 0; i < active_gpu_count; i++)
-      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
 }
+
 template <typename Torus>
-void multi_gpu_release_lwe(cudaStream_t *streams, uint32_t *gpu_indexes,
-                           std::vector<Torus *> &vec,
-                           bool sync_threads = true) {
+void multi_gpu_release(cudaStream_t *streams, uint32_t *gpu_indexes,
+                       std::vector<Torus *> &vec, bool sync_threads = true) {

-  for (uint i = 0; i < vec.size(); i++) {
+  for (uint i = 0; i < vec.size(); i++)
     cuda_drop_async(vec[i], streams[i], gpu_indexes[i]);
-  }
+
   if (sync_threads)
     for (uint i = 0; i < vec.size(); i++)
       cuda_synchronize_stream(streams[i], gpu_indexes[i]);
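
Reviewer note (not part of the diff): the net effect of these changes is that execute_keyswitch and execute_pbs lose their sync_streams flag and never synchronize streams internally, multi_gpu_scatter_lwe only synchronizes GPU 0 before it starts copying, multi_gpu_gather_lwe performs no synchronization at all, and the caller blocks on every active GPU itself once the gather has been enqueued (see the loop added in host_integer_sum_ciphertexts_vec_kb). The sketch below illustrates that caller-managed synchronization pattern against the plain CUDA runtime rather than the backend helpers; every name in it is hypothetical, it omits error checking, and it assumes one stream per GPU (each created on its own device) plus CUDA 11.2+ for the stream-ordered allocator.

```cpp
#include <cuda_runtime.h>
#include <cstddef>
#include <cstdint>
#include <vector>

// Stand-in for the per-GPU work (keyswitch + PBS on that GPU's slice).
__global__ void work_kernel(uint64_t *data, size_t n) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] += 1;
}

// GPU 0 owns d_src/d_dst (gpus.size() * n_per_gpu elements each);
// streams[i] was created on device gpus[i].
void fan_out_compute_gather(std::vector<cudaStream_t> &streams,
                            std::vector<int> &gpus, uint64_t *d_src,
                            uint64_t *d_dst, size_t n_per_gpu) {
  size_t bytes = n_per_gpu * sizeof(uint64_t);
  std::vector<uint64_t *> slices(gpus.size());

  // Wait once for whatever produced d_src on GPU 0's stream before other
  // GPUs start pulling from it (the only sync the scatter step keeps).
  cudaStreamSynchronize(streams[0]);

  for (size_t i = 0; i < gpus.size(); i++) {
    cudaSetDevice(gpus[i]);
    cudaMallocAsync((void **)&slices[i], bytes, streams[i]);
    // Scatter: copy GPU 0's i-th slice to GPU i, asynchronously.
    cudaMemcpyPeerAsync(slices[i], gpus[i], d_src + i * n_per_gpu, gpus[0],
                        bytes, streams[i]);
    // Per-GPU work is enqueued with no hidden synchronization, mirroring
    // execute_keyswitch / execute_pbs after this change.
    work_kernel<<<(n_per_gpu + 255) / 256, 256, 0, streams[i]>>>(slices[i],
                                                                 n_per_gpu);
    // Gather: copy the result back to GPU 0, still asynchronously.
    cudaMemcpyPeerAsync(d_dst + i * n_per_gpu, gpus[0], slices[i], gpus[i],
                        bytes, streams[i]);
    cudaFreeAsync(slices[i], streams[i]);
  }

  // The caller decides when to block: wait for every active GPU before
  // GPU 0 reads d_dst, as the new loop in the multiplication path does.
  for (size_t i = 0; i < streams.size(); i++)
    cudaStreamSynchronize(streams[i]);
}
```

The design point is that a chain of scatter, keyswitch, PBS and gather enqueued on the same per-GPU streams only needs these two synchronization points, instead of a stream-wide stall inside every helper call.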