From 10263d1579cabed218c1a3a1e381aec6ab688df2 Mon Sep 17 00:00:00 2001
From: Pedro Alves
Date: Mon, 17 Jun 2024 15:37:18 -0300
Subject: [PATCH] chore(gpu): rewrite the scatter/gather logic and move to
 scratch functions

---
 .../tfhe-cuda-backend/cuda/include/integer.h  |  30 +++
 .../cuda/src/integer/integer.cuh              | 105 +++--------
 .../cuda/src/integer/multiplication.cuh       |  63 +++----
 .../cuda/src/utils/helper_multi_gpu.cuh       | 173 ++++++++++--------
 4 files changed, 181 insertions(+), 190 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
index 01c42a6ef4..1f8c75ff03 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -473,6 +473,13 @@ template <typename Torus> struct int_radix_lut {
   Torus *tmp_lwe_before_ks;
   Torus *tmp_lwe_after_ks;
 
+  /// For multi GPU execution we create vectors of pointers for inputs and
+  /// outputs
+  std::vector<Torus *> lwe_array_in_vec;
+  std::vector<Torus *> lwe_after_ks_vec;
+  std::vector<Torus *> lwe_indexes_in_vec;
+  std::vector<Torus *> lwe_trivial_indexes_vec;
+
   int_radix_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
                 uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
                 uint32_t num_radix_blocks, bool allocate_gpu_memory) {
@@ -548,6 +555,16 @@
     cuda_stream_add_callback(streams[0], gpu_indexes[0],
                              host_free_on_stream_callback, h_lwe_indexes);
 
+    /// With multiple GPUs we allocate the per-GPU arrays that are pushed to
+    /// the vectors; when we later gather the data back to GPU 0 we can copy
+    /// it back to the original indexing
+    multi_gpu_lwe_init(streams, gpu_indexes, gpu_count, lwe_array_in_vec,
+                       lwe_indexes_in_vec, num_radix_blocks,
+                       params.big_lwe_dimension + 1);
+    multi_gpu_lwe_init(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
+                       lwe_trivial_indexes_vec, num_radix_blocks,
+                       params.small_lwe_dimension + 1);
+
     // Keyswitch
     Torus big_size =
         (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
@@ -580,6 +597,14 @@
     tmp_lwe_before_ks = base_lut_object->tmp_lwe_before_ks;
     tmp_lwe_after_ks = base_lut_object->tmp_lwe_after_ks;
 
+    /// With multiple GPUs we reuse the per-GPU arrays allocated by the base
+    /// LUT object; when we gather the data back to GPU 0 we can copy it back
+    /// to the original indexing
+    lwe_array_in_vec = base_lut_object->lwe_array_in_vec;
+    lwe_after_ks_vec = base_lut_object->lwe_after_ks_vec;
+    lwe_indexes_in_vec = base_lut_object->lwe_indexes_in_vec;
+    lwe_trivial_indexes_vec = base_lut_object->lwe_trivial_indexes_vec;
+
     mem_reuse = true;
 
     // Allocate LUT
@@ -701,6 +726,11 @@
       cuda_synchronize_stream(streams[i], gpu_indexes[i]);
     }
     buffer.clear();
+
+    multi_gpu_lwe_release(streams, gpu_indexes, lwe_array_in_vec);
+    multi_gpu_lwe_release(streams, gpu_indexes, lwe_after_ks_vec);
+    multi_gpu_lwe_release(streams, gpu_indexes, lwe_indexes_in_vec);
+    multi_gpu_lwe_release(streams, gpu_indexes, lwe_trivial_indexes_vec);
   }
  }
 };
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 0ec2a1a9b0..79d669bf42 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -160,52 +160,28 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
 
   /// For multi GPU execution we create vectors of pointers for inputs and
   /// outputs
-  std::vector<Torus *> lwe_array_in_vec;
-  std::vector<Torus *> lwe_after_ks_vec;
-  std::vector<Torus *> lwe_indexes_in_vec;
-  std::vector<Torus *> lwe_trivial_indexes_vec;
-
-  /// With multiple GPUs we allocate arrays to be pushed to the vectors and copy
-  /// data on each GPU then when we gather data to GPU 0 we can copy back to the
-  /// original indexing
-  if (gpu_count > 1) {
-    multi_gpu_scatter(streams, gpu_indexes, gpu_count, lwe_array_in_vec,
-                      lwe_array_in, lwe_indexes_in_vec,
-                      lut->lwe_indexes_in, num_radix_blocks,
-                      big_lwe_dimension + 1);
-    multi_gpu_scatter(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
-                      lut->tmp_lwe_after_ks, lwe_trivial_indexes_vec,
-                      lut->lwe_trivial_indexes, num_radix_blocks,
-                      small_lwe_dimension + 1);
-  } else {
-    /// GPU 0 retains the original array
-    lwe_array_in_vec.push_back(lwe_array_in);
-    lwe_after_ks_vec.push_back(lut->tmp_lwe_after_ks);
-    lwe_indexes_in_vec.push_back(lut->lwe_indexes_in);
-    lwe_trivial_indexes_vec.push_back(lut->lwe_trivial_indexes);
-  }
+  std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
+  std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
+  std::vector<Torus *> lwe_indexes_in_vec = lut->lwe_indexes_in_vec;
+  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
+
+  /// With multiple GPUs the data is pushed to the per-GPU vectors; when we
+  /// gather it back to GPU 0 we can copy it back to the original indexing
+  multi_gpu_lwe_scatter(
+      streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_in,
+      lut->lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1);
 
   /// Apply KS to go from a big LWE dimension to a small LWE dimension
   execute_keyswitch(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
                     lwe_trivial_indexes_vec, lwe_array_in_vec,
-                    lwe_indexes_in_vec, ksks, big_lwe_dimension,
+                    lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
                     small_lwe_dimension, ks_base_log, ks_level,
                     num_radix_blocks, false);
-  /// Copy data back to GPU 0 and release vecs
-  if (gpu_count > 1) {
-    multi_gpu_gather(streams, gpu_indexes, gpu_count,
-                     lut->tmp_lwe_after_ks, lwe_after_ks_vec,
-                     lut->lwe_trivial_indexes, num_radix_blocks,
-                     small_lwe_dimension + 1);
-    multi_gpu_release(streams, gpu_indexes, lwe_array_in_vec);
-    multi_gpu_release(streams, gpu_indexes, lwe_after_ks_vec);
-    multi_gpu_release(streams, gpu_indexes, lwe_indexes_in_vec);
-    multi_gpu_release(streams, gpu_indexes, lwe_trivial_indexes_vec);
-  }
-  lwe_array_in_vec.clear();
-  lwe_after_ks_vec.clear();
-  lwe_indexes_in_vec.clear();
-  lwe_trivial_indexes_vec.clear();
+
+  /// Copy data back to GPU 0
+  multi_gpu_lwe_gather(
+      streams, gpu_indexes, gpu_count, lut->tmp_lwe_after_ks, lwe_after_ks_vec,
+      lut->lwe_trivial_indexes, num_radix_blocks, small_lwe_dimension + 1);
 
   /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
   /// dimension to a big LWE dimension
@@ -233,7 +209,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
 
   auto params = lut->params;
   auto pbs_type = params.pbs_type;
-  uint32_t big_lwe_dimension = params.big_lwe_dimension;
+  auto big_lwe_dimension = params.big_lwe_dimension;
   auto small_lwe_dimension = params.small_lwe_dimension;
   auto ks_level = params.ks_level;
   auto ks_base_log = params.ks_base_log;
@@ -255,49 +231,26 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
 
   /// For multi GPU execution we create vectors of pointers for inputs and
   /// outputs
-  std::vector<Torus *> lwe_array_in_vec;
-  std::vector<Torus *> lwe_after_ks_vec;
-  std::vector<Torus *> lwe_indexes_in_vec;
-  std::vector<Torus *> lwe_trivial_indexes_vec;
-  if (gpu_count > 1) {
-    multi_gpu_scatter(streams, gpu_indexes, gpu_count, lwe_array_in_vec,
-                      lwe_array_pbs_in, lwe_indexes_in_vec,
-                      lut->lwe_indexes_in, num_radix_blocks,
-                      big_lwe_dimension + 1);
-    multi_gpu_scatter(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
-                      lut->tmp_lwe_after_ks, lwe_trivial_indexes_vec,
-                      lut->lwe_trivial_indexes, num_radix_blocks,
-                      small_lwe_dimension + 1);
-  } else {
-    /// GPU 0 retains the original array
-    lwe_array_in_vec.push_back(lwe_array_pbs_in);
-    lwe_after_ks_vec.push_back(lut->tmp_lwe_after_ks);
-    lwe_indexes_in_vec.push_back(lut->lwe_indexes_in);
-    lwe_trivial_indexes_vec.push_back(lut->lwe_trivial_indexes);
-  }
+  std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
+  std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
+  std::vector<Torus *> lwe_indexes_in_vec = lut->lwe_indexes_in_vec;
+  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
+
+  multi_gpu_lwe_scatter(
+      streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
+      lut->lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1);
 
   /// Apply KS to go from a big LWE dimension to a small LWE dimension
   execute_keyswitch(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
                     lwe_trivial_indexes_vec, lwe_array_in_vec,
-                    lwe_indexes_in_vec, ksks, big_lwe_dimension,
+                    lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
                     small_lwe_dimension, ks_base_log, ks_level,
                     num_radix_blocks, false);
 
   /// Copy data back to GPU 0 and release vecs
-  if (gpu_count > 1) {
-    multi_gpu_gather(streams, gpu_indexes, gpu_count,
-                     lut->tmp_lwe_after_ks, lwe_after_ks_vec,
-                     lut->lwe_trivial_indexes, num_radix_blocks,
-                     small_lwe_dimension + 1);
-    multi_gpu_release(streams, gpu_indexes, lwe_array_in_vec);
-    multi_gpu_release(streams, gpu_indexes, lwe_after_ks_vec);
-    multi_gpu_release(streams, gpu_indexes, lwe_indexes_in_vec);
-    multi_gpu_release(streams, gpu_indexes, lwe_trivial_indexes_vec);
-  }
-  lwe_array_in_vec.clear();
-  lwe_after_ks_vec.clear();
-  lwe_indexes_in_vec.clear();
-  lwe_trivial_indexes_vec.clear();
+  multi_gpu_lwe_gather(
+      streams, gpu_indexes, gpu_count, lut->tmp_lwe_after_ks, lwe_after_ks_vec,
+      lut->lwe_trivial_indexes, num_radix_blocks, small_lwe_dimension + 1);
 
   /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
   /// dimension to a big LWE dimension
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
index fadb2420c9..5bb11a5f2a 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -225,11 +225,12 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
   auto message_modulus = mem_ptr->params.message_modulus;
   auto carry_modulus = mem_ptr->params.carry_modulus;
   auto num_blocks = num_blocks_in_radix;
-  auto big_lwe_size = mem_ptr->params.big_lwe_dimension + 1;
+  auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
+  auto big_lwe_size = big_lwe_dimension + 1;
   auto glwe_dimension = mem_ptr->params.glwe_dimension;
   auto polynomial_size = mem_ptr->params.polynomial_size;
   auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
-  auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
+  auto small_lwe_size = lwe_dimension + 1;
 
   if (old_blocks != terms) {
     cuda_memcpy_async_gpu_to_gpu(old_blocks, terms,
@@ -334,52 +335,32 @@
 
   /// For multi GPU execution we create vectors of pointers for inputs and
   /// outputs
-  std::vector<Torus *> lwe_array_in_vec;
-  std::vector<Torus *> lwe_after_ks_vec;
-  std::vector<Torus *> lwe_indexes_in_vec;
-  std::vector<Torus *> lwe_trivial_indexes_vec;
-  if (gpu_count > 1) {
-    multi_gpu_scatter(streams, gpu_indexes, gpu_count,
-                      lwe_array_in_vec, new_blocks, lwe_indexes_in_vec,
-                      lwe_indexes_in, message_count,
-                      big_lwe_dimension + 1);
-    multi_gpu_scatter(streams, gpu_indexes, gpu_count,
-                      lwe_after_ks_vec, small_lwe_vector,
-                      lwe_trivial_indexes_vec, lwe_indexes_in,
-                      message_count, lwe_dimension + 1);
-  } else {
-    /// GPU 0 retains the original array
-    lwe_array_in_vec.push_back(new_blocks);
-    lwe_after_ks_vec.push_back(small_lwe_vector);
-    lwe_indexes_in_vec.push_back(lwe_indexes_in);
-    lwe_trivial_indexes_vec.push_back(lwe_indexes_in);
-  }
+  std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
+  std::vector<Torus *> small_lwe_vector_vec =
+      luts_message_carry->lwe_after_ks_vec;
+  std::vector<Torus *> lwe_indexes_in_vec =
+      luts_message_carry->lwe_indexes_in_vec;
+  std::vector<Torus *> lwe_trivial_indexes_vec =
+      luts_message_carry->lwe_trivial_indexes_vec;
+
+  multi_gpu_lwe_scatter(streams, gpu_indexes, gpu_count,
+                        new_blocks_vec, new_blocks, lwe_indexes_in,
+                        message_count, big_lwe_size);
 
   /// Apply KS to go from a big LWE dimension to a small LWE dimension
   /// After this keyswitch execution, we need to synchronize the streams
   /// because the keyswitch and PBS do not operate on the same number of
   /// inputs
-  execute_keyswitch(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
-                    lwe_trivial_indexes_vec, lwe_array_in_vec,
-                    lwe_indexes_in_vec, ksks,
-                    polynomial_size * glwe_dimension, lwe_dimension,
-                    mem_ptr->params.ks_base_log,
-                    mem_ptr->params.ks_level, message_count, true);
+  execute_keyswitch(
+      streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
+      lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec, ksks,
+      big_lwe_dimension, lwe_dimension, mem_ptr->params.ks_base_log,
+      mem_ptr->params.ks_level, message_count, true);
 
   /// Copy data back to GPU 0 and release vecs
-  if (gpu_count > 1) {
-    multi_gpu_gather(streams, gpu_indexes, gpu_count, small_lwe_vector,
-                     lwe_after_ks_vec, lwe_indexes_in, message_count,
-                     lwe_dimension + 1);
-    multi_gpu_release(streams, gpu_indexes, lwe_array_in_vec);
-    multi_gpu_release(streams, gpu_indexes, lwe_after_ks_vec);
-    multi_gpu_release(streams, gpu_indexes, lwe_indexes_in_vec);
-    multi_gpu_release(streams, gpu_indexes, lwe_trivial_indexes_vec);
-  }
-  lwe_array_in_vec.clear();
-  lwe_after_ks_vec.clear();
-  lwe_indexes_in_vec.clear();
-  lwe_trivial_indexes_vec.clear();
+  multi_gpu_lwe_gather(streams, gpu_indexes, gpu_count,
+                       small_lwe_vector, small_lwe_vector_vec,
+                       lwe_indexes_in, message_count, small_lwe_size);
 
   /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
   /// dimension to a big LWE dimension
diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
index 47036e307f..75295e5d0e 100644
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
@@ -3,117 +3,144 @@
 
 #include "helper_multi_gpu.h"
 
-/// Load an array residing on one GPU to all active gpus
-/// and split the array among them.
-/// The indexing logic is given by an index array.
+/// Allocates the input/output vectors for all devices
+/// and initializes the corresponding index arrays to the trivial
+/// indexing
 template <typename Torus>
-void multi_gpu_scatter(cudaStream_t *streams, uint32_t *gpu_indexes,
-                       uint32_t gpu_count, std::vector<Torus *> &dest,
-                       Torus *src, std::vector<Torus *> &dest_indexes,
-                       Torus *src_indexes, uint32_t num_inputs,
-                       uint32_t elements_per_input) {
-
+void multi_gpu_lwe_init(cudaStream_t *streams, uint32_t *gpu_indexes,
+                        uint32_t gpu_count, std::vector<Torus *> &dest,
+                        std::vector<Torus *> &dest_indexes, uint32_t num_inputs,
+                        uint32_t elements_per_input) {
   auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
-  auto cpu_indexes = (Torus *)malloc(num_inputs * sizeof(Torus));
-  cuda_memcpy_async_to_cpu(cpu_indexes, src_indexes, num_inputs * sizeof(Torus),
-                           streams[0], gpu_indexes[0]);
-  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+  auto h_lwe_trivial_indexes = (Torus *)malloc(num_inputs * sizeof(Torus));
+  for (int i = 0; i < num_inputs; i++)
+    h_lwe_trivial_indexes[i] = i;
 
-  // TODO move allocation/drop to scratch/cleanup
+  dest.resize(active_gpu_count);
+  dest_indexes.resize(active_gpu_count);
+#pragma omp parallel for num_threads(active_gpu_count)
   for (uint i = 0; i < active_gpu_count; i++) {
     auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
-    Torus *array = (Torus *)cuda_malloc_async(
+    Torus *d_array = (Torus *)cuda_malloc_async(
         inputs_on_gpu * elements_per_input * sizeof(Torus), streams[i],
         gpu_indexes[i]);
-    Torus *index_array = (Torus *)cuda_malloc_async(
+    Torus *d_index_array = (Torus *)cuda_malloc_async(
         inputs_on_gpu * sizeof(Torus), streams[i], gpu_indexes[i]);
-    cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-    dest.push_back(array);
-    dest_indexes.push_back(index_array);
+
+    cuda_memcpy_async_to_gpu(d_index_array, h_lwe_trivial_indexes,
+                             inputs_on_gpu * sizeof(Torus), streams[i],
+                             gpu_indexes[i]);
+
+    dest[i] = d_array;
+    dest_indexes[i] = d_index_array;
   }
 
-#pragma omp parallel for num_threads(num_inputs)
-  for (uint j = 0; j < num_inputs; j++) {
-    int gpu_index = 0;
-    Torus index_on_gpu = 0;
-    Torus accumulated_inputs = 0;
-    for (uint i = 0; i < active_gpu_count; i++) {
-      int inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
-      if (j < accumulated_inputs + inputs_on_gpu) {
-        gpu_index = i;
-        index_on_gpu = j - accumulated_inputs;
-        printf("input j: %d, gpu_index: %d, index on gpu: %d\n", j, gpu_indexes,
-               index_on_gpu);
-      }
-      accumulated_inputs += inputs_on_gpu;
+  for (uint i = 0; i < active_gpu_count; i++)
+    cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+
+  free(h_lwe_trivial_indexes);
+}
+/// Load an array residing on one GPU to all active GPUs
+/// and split the array among them.
+/// The input indexing logic is given by an index array.
+/// The output indexing is always the trivial one
+template <typename Torus>
+void multi_gpu_lwe_scatter(cudaStream_t *streams, uint32_t *gpu_indexes,
+                           uint32_t gpu_count, std::vector<Torus *> &dest,
+                           Torus *src, Torus *d_src_indexes,
+                           uint32_t num_inputs, uint32_t elements_per_input) {
+
+  auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
+
+  auto h_src_indexes = (Torus *)malloc(num_inputs * sizeof(Torus));
+  cuda_memcpy_async_to_cpu(h_src_indexes, d_src_indexes,
+                           num_inputs * sizeof(Torus), streams[0],
+                           gpu_indexes[0]);
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
+  dest.resize(active_gpu_count);
+
+#pragma omp parallel for num_threads(active_gpu_count)
+  for (uint i = 0; i < active_gpu_count; i++) {
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto gpu_offset = 0;
+    for (uint j = 0; j < i; j++) {
+      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
+    }
+    auto src_indexes = h_src_indexes + gpu_offset;
+
+    // TODO Check if we can increase parallelization by adding another omp
+    // clause here
+    for (uint j = 0; j < inputs_on_gpu; j++) {
+      auto d_dest = dest[i] + j * elements_per_input;
+      auto d_src = src + src_indexes[j] * elements_per_input;
+
+      cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
+                                   elements_per_input * sizeof(Torus),
+                                   streams[i], gpu_indexes[i]);
     }
-    cuda_memcpy_async_gpu_to_gpu(dest[gpu_index] +
-                                     index_on_gpu * elements_per_input,
-                                 src + cpu_indexes[j] * elements_per_input,
-                                 elements_per_input * sizeof(Torus),
-                                 streams[gpu_index], gpu_indexes[gpu_index]);
-    cuda_memset_async(dest_indexes[gpu_index] + index_on_gpu, index_on_gpu,
-                      sizeof(Torus), streams[gpu_index],
-                      gpu_indexes[gpu_index]);
   }
-  for (uint i = 0; i < active_gpu_count; i++) {
+
+  for (uint i = 0; i < active_gpu_count; i++)
     cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-  }
-  free(cpu_indexes);
+
+  free(h_src_indexes);
 }
 
 /// Copy data from multiple GPUs back to GPU 0 following the indexing given in
 /// dest_indexes
+/// The input indexing should be the trivial one
 template <typename Torus>
-void multi_gpu_gather(cudaStream_t *streams, uint32_t *gpu_indexes,
-                      uint32_t gpu_count, Torus *dest,
-                      const std::vector<Torus *> &src, Torus *dest_indexes,
-                      uint32_t num_inputs, uint32_t elements_per_input) {
+void multi_gpu_lwe_gather(cudaStream_t *streams, uint32_t *gpu_indexes,
+                          uint32_t gpu_count, Torus *dest,
+                          const std::vector<Torus *> &src,
+                          Torus *d_dest_indexes, uint32_t num_inputs,
+                          uint32_t elements_per_input) {
 
   auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
 
-  auto dest_cpu_indexes = (Torus *)malloc(num_inputs * sizeof(Torus));
-  cuda_memcpy_async_to_cpu(dest_cpu_indexes, dest_indexes,
+  auto h_dest_indexes = (Torus *)malloc(num_inputs * sizeof(Torus));
+  cuda_memcpy_async_to_cpu(h_dest_indexes, d_dest_indexes,
                            num_inputs * sizeof(Torus), streams[0],
                            gpu_indexes[0]);
   cuda_synchronize_stream(streams[0], gpu_indexes[0]);
 
-#pragma omp parallel for num_threads(num_inputs)
-  for (uint j = 0; j < num_inputs; j++) {
-    int gpu_index = 0;
-    Torus index_on_gpu = 0;
-    Torus accumulated_inputs = 0;
-    for (uint i = 0; i < active_gpu_count; i++) {
-      int inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
-      if (j < accumulated_inputs + inputs_on_gpu) {
-        gpu_index = i;
-        index_on_gpu = j - accumulated_inputs;
-        break;
-      }
-      accumulated_inputs += inputs_on_gpu;
+#pragma omp parallel for num_threads(active_gpu_count)
+  for (uint i = 0; i < active_gpu_count; i++) {
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto gpu_offset = 0;
+    for (uint j = 0; j < i; j++) {
+      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
+    }
+    auto dest_indexes = h_dest_indexes + gpu_offset;
+
+    // TODO Check if we can increase parallelization by adding another omp
+    // clause here
+    for (uint j = 0; j < inputs_on_gpu; j++) {
+      auto d_dest = dest + dest_indexes[j] * elements_per_input;
+      auto d_src = src[i] + j * elements_per_input;
+
+      cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
+                                   elements_per_input * sizeof(Torus),
+                                   streams[i], gpu_indexes[i]);
     }
-    cuda_memcpy_async_gpu_to_gpu(
-        dest + dest_cpu_indexes[j] * elements_per_input,
-        src[gpu_index] + index_on_gpu * elements_per_input,
-        elements_per_input * sizeof(Torus), streams[gpu_index],
-        gpu_indexes[gpu_index]);
   }
-  for (uint i = 0; i < active_gpu_count; i++) {
+
+  for (uint i = 0; i < active_gpu_count; i++)
     cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-  }
-  free(dest_cpu_indexes);
+
+  free(h_dest_indexes);
 }
 
 template <typename Torus>
-void multi_gpu_release(cudaStream_t *streams, uint32_t *gpu_indexes,
-                       std::vector<Torus *> &vec) {
+void multi_gpu_lwe_release(cudaStream_t *streams, uint32_t *gpu_indexes,
+                           std::vector<Torus *> &vec) {
 
 #pragma omp parallel for num_threads(vec.size())
   for (uint i = 0; i < vec.size(); i++) {
     cuda_drop_async(vec[i], streams[i], gpu_indexes[i]);
     cuda_synchronize_stream(streams[i], gpu_indexes[i]);
   }
+  vec.clear();
 }
 
 #endif
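
For reference, a minimal sketch of how the reworked pieces fit together, mirroring the univariate apply path in integer.cuh above. The wrapper name apply_lut_ks_multi_gpu_sketch and its parameter list (including the Torus **ksks type) are assumptions made only to keep the example self-contained; it relies on the per-GPU buffers having been allocated once by multi_gpu_lwe_init in the int_radix_lut scratch and released later through multi_gpu_lwe_release:

  template <typename Torus>
  void apply_lut_ks_multi_gpu_sketch(
      cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
      int_radix_lut<Torus> *lut, Torus *lwe_array_in, Torus **ksks,
      uint32_t big_lwe_dimension, uint32_t small_lwe_dimension,
      uint32_t ks_base_log, uint32_t ks_level, uint32_t num_radix_blocks) {
    // The per-GPU buffers and trivial indexes come from the scratch
    // (multi_gpu_lwe_init), so nothing is allocated here.
    std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
    std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
    std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

    // Split the inputs across the active GPUs following the caller-provided
    // input indexes; the per-GPU copies end up trivially indexed.
    multi_gpu_lwe_scatter(streams, gpu_indexes, gpu_count, lwe_array_in_vec,
                          lwe_array_in, lut->lwe_indexes_in, num_radix_blocks,
                          big_lwe_dimension + 1);

    // Keyswitch on every GPU, using the trivial indexing on both sides.
    execute_keyswitch(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
                      lwe_trivial_indexes_vec, lwe_array_in_vec,
                      lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
                      small_lwe_dimension, ks_base_log, ks_level,
                      num_radix_blocks, false);

    // Bring the keyswitched blocks back to GPU 0 in the original layout; the
    // buffers themselves are freed later by multi_gpu_lwe_release.
    multi_gpu_lwe_gather(streams, gpu_indexes, gpu_count, lut->tmp_lwe_after_ks,
                         lwe_after_ks_vec, lut->lwe_trivial_indexes,
                         num_radix_blocks, small_lwe_dimension + 1);
  }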