From 5eaf4fe5e0f508761d72cbe6ae9f36d1128224fd Mon Sep 17 00:00:00 2001
From: Pedro Alves
Date: Wed, 26 Jun 2024 18:40:38 +0000
Subject: [PATCH] refactor(gpu): add a parameter to enable optimizations in
 case lwe_indexes_(in/out) is trivial

---
 .../tfhe-cuda-backend/cuda/include/integer.h  |  61 ++++----
 .../cuda/src/integer/integer.cuh              |  33 ++--
 .../cuda/src/integer/multiplication.cuh       |  34 ++---
 .../cuda/src/utils/helper_multi_gpu.cuh       | 141 ++++++------
 4 files changed, 109 insertions(+), 160 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
index be2061e339..f79feb759a 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -463,6 +463,8 @@ template <typename Torus> struct int_radix_lut {
   Torus *lwe_indexes_out;
   Torus *h_lwe_indexes_in;
   Torus *h_lwe_indexes_out;
+  // Enable optimizations if lwe_indexes_(in/out) are trivial
+  bool using_trivial_lwe_indexes = true;
   // lwe_trivial_indexes is the intermediary index we need in case
   // lwe_indexes_in != lwe_indexes_out
   Torus *lwe_trivial_indexes;
@@ -537,22 +539,20 @@ template <typename Torus> struct int_radix_lut {
     h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
     h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
 
-    auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
     for (int i = 0; i < num_radix_blocks; i++)
-      h_lwe_indexes[i] = i;
+      h_lwe_indexes_in[i] = i;
 
-    cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes,
+    cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
                              num_radix_blocks * sizeof(Torus), streams[0],
                              gpu_indexes[0]);
-    cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes,
+    cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in,
                              num_radix_blocks * sizeof(Torus), streams[0],
                              gpu_indexes[0]);
-    cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
+    cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in,
                              num_radix_blocks * sizeof(Torus), streams[0],
                              gpu_indexes[0]);
-    cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                             host_free_on_stream_callback, h_lwe_indexes);
+    memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
+           num_radix_blocks * sizeof(Torus));
 
     /// With multiple GPUs we allocate arrays to be pushed to the vectors and
     /// copy data on each GPU then when we gather data to GPU 0 we can copy
@@ -641,22 +641,20 @@ template <typename Torus> struct int_radix_lut {
     h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
     h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
 
-    auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
     for (int i = 0; i < num_radix_blocks; i++)
-      h_lwe_indexes[i] = i;
+      h_lwe_indexes_in[i] = i;
 
-    cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes,
+    cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
                              num_radix_blocks * sizeof(Torus), streams[0],
                              gpu_indexes[0]);
-    cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes,
+    cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in,
                              num_radix_blocks * sizeof(Torus), streams[0],
                              gpu_indexes[0]);
-    cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
+    cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in,
                              num_radix_blocks * sizeof(Torus), streams[0],
                              gpu_indexes[0]);
-    cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                             host_free_on_stream_callback, h_lwe_indexes);
+    memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
+           num_radix_blocks * sizeof(Torus));
   }
 
   // Return a pointer to idx-ith lut at gpu_index's global memory
@@ -674,6 +672,22 @@ template <typename Torus> struct int_radix_lut {
     return &lut_indexes[ind];
   }
 
+  // If this function is called we assume the lwe_indexes_(in/out) are no
+  // longer trivial, so we disable the related optimizations
+  void set_lwe_indexes(cudaStream_t stream, uint32_t gpu_index,
+                       Torus *h_indexes_in, Torus *h_indexes_out) {
+
+    memcpy(h_lwe_indexes_in, h_indexes_in, num_blocks * sizeof(Torus));
+    memcpy(h_lwe_indexes_out, h_indexes_out, num_blocks * sizeof(Torus));
+
+    cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
+                             num_blocks * sizeof(Torus), stream, gpu_index);
+    cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_out,
+                             num_blocks * sizeof(Torus), stream, gpu_index);
+
+    using_trivial_lwe_indexes = false;
+  }
+
   // Broadcast luts from gpu src_gpu_idx to all active gpus
   void broadcast_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
                      uint32_t src_gpu_idx) {
@@ -805,12 +819,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
       for (int i = 0; i < bits_per_block; i++)
         h_lwe_indexes_in[i + j * bits_per_block] = j;
     }
-    cuda_memcpy_async_to_gpu(lut->lwe_indexes_in, h_lwe_indexes_in,
-                             num_radix_blocks * bits_per_block *
-                                 sizeof(Torus),
-                             streams[0], gpu_indexes[0]);
-    cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                             host_free_on_stream_callback, h_lwe_indexes_in);
 
     /**
      * the output should aim different lwe ciphertexts, so lwe_indexes_out =
@@ -822,10 +830,11 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
     for (int i = 0; i < num_radix_blocks * bits_per_block; i++)
       h_lwe_indexes_out[i] = i;
 
-    cuda_memcpy_async_to_gpu(lut->lwe_indexes_out, h_lwe_indexes_out,
-                             num_radix_blocks * bits_per_block *
-                                 sizeof(Torus),
-                             streams[0], gpu_indexes[0]);
+    lut->set_lwe_indexes(streams[0], gpu_indexes[0], h_lwe_indexes_in,
+                         h_lwe_indexes_out);
+
+    cuda_stream_add_callback(streams[0], gpu_indexes[0],
+                             host_free_on_stream_callback, h_lwe_indexes_in);
     cuda_stream_add_callback(streams[0], gpu_indexes[0],
                              host_free_on_stream_callback, h_lwe_indexes_out);
   }
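Usage sketch (illustrative only, not taken from the patch): the flag starts out true because both constructors above now fill lwe_indexes_(in/out) with the identity mapping, and only set_lwe_indexes() clears it. A caller that needs a non-trivial mapping is expected to go through that setter so the host mirrors, the device buffers and the fast-path flag stay consistent, as int_bit_extract_luts_buffer does above. The sketch below assumes a uint64_t Torus, a lut whose num_blocks equals num_radix_blocks, and a made-up "reversed output" ordering.

// Illustrative only; requires the backend's integer.h (int_radix_lut) and the
// CUDA runtime headers (cudaStream_t) in addition to the includes below.
#include <cstdint>
#include <vector>

void install_reversed_output_indexes(cudaStream_t stream, uint32_t gpu_index,
                                     int_radix_lut<uint64_t> *lut,
                                     uint32_t num_radix_blocks) {
  std::vector<uint64_t> h_in(num_radix_blocks), h_out(num_radix_blocks);
  for (uint32_t i = 0; i < num_radix_blocks; i++) {
    h_in[i] = i;                         // inputs keep the trivial order
    h_out[i] = num_radix_blocks - 1 - i; // outputs are written in reverse
  }
  // Copies both tables to the host mirrors and the device buffers, and flips
  // using_trivial_lwe_indexes to false so the multi-GPU scatter/gather take
  // the per-block indexed path instead of the contiguous fast path.
  lut->set_lwe_indexes(stream, gpu_index, h_in.data(), h_out.data());
}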
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 9856a19d18..3f28fdf78b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -185,9 +185,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
 
     /// With multiple GPUs we push to the vectors on each GPU then when we
     /// gather data to GPU 0 we can copy back to the original indexing
-    multi_gpu_lwe_trivial_scatter<Torus>(
+    multi_gpu_lwe_scatter<Torus>(
         streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_in,
-        num_radix_blocks, big_lwe_dimension + 1, false);
+        lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
+        big_lwe_dimension + 1, false);
 
     /// Apply KS to go from a big LWE dimension to a small LWE dimension
     execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
@@ -207,9 +208,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
         cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
 
     /// Copy data back to GPU 0 and release vecs
-    multi_gpu_lwe_trivial_gather<Torus>(
-        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_after_pbs_vec,
-        num_radix_blocks, big_lwe_dimension + 1, false);
+    multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+                                lwe_after_pbs_vec, lut->h_lwe_indexes_out,
+                                lut->using_trivial_lwe_indexes,
+                                num_radix_blocks, big_lwe_dimension + 1, false);
 
     /// Synchronize all GPUs
     for (uint i = 0; i < active_gpu_count; i++) {
@@ -271,13 +273,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
         cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
   } else {
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-    // multi_gpu_lwe_scatter<Torus>(
-    //     streams, gpu_indexes, gpu_count, lwe_array_in_vec,
-    //     lwe_array_pbs_in, h_lwe_indexes_in, num_radix_blocks,
-    //     big_lwe_dimension + 1, false);
-    multi_gpu_lwe_trivial_scatter<Torus>(
+    multi_gpu_lwe_scatter<Torus>(
         streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
-        num_radix_blocks, big_lwe_dimension + 1, false);
+        lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
+        big_lwe_dimension + 1, false);
 
     /// Apply KS to go from a big LWE dimension to a small LWE dimension
     execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
@@ -297,14 +296,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
         cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
 
     /// Copy data back to GPU 0 and release vecs
-    // multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count,
-    //                             lwe_array_out,
-    //                             lwe_after_pbs_vec, h_lwe_indexes_out,
-    //                             num_radix_blocks, big_lwe_dimension + 1,
-    //                             false);
-    multi_gpu_lwe_trivial_gather<Torus>(
-        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_after_pbs_vec,
-        num_radix_blocks, big_lwe_dimension + 1, false);
+    multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+                                lwe_after_pbs_vec, lut->h_lwe_indexes_out,
+                                lut->using_trivial_lwe_indexes,
+                                num_radix_blocks, big_lwe_dimension + 1, false);
 
     /// Synchronize all GPUs
     for (uint i = 0; i < active_gpu_count; i++) {
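Why threading lut->using_trivial_lwe_indexes through these call sites pays off is easiest to see in the helper_multi_gpu.cuh hunks further below: when the indexes are trivial, the blocks owned by GPU i are contiguous in the source array, so the whole chunk moves with a single copy of inputs_on_gpu * elements_per_input elements instead of one copy per block. The host-only model below reproduces that arithmetic with memcpy standing in for cuda_memcpy_async_gpu_to_gpu; the balanced-split helper is an assumption for illustration, not necessarily how get_num_inputs_on_gpu rounds.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Stand-in for get_num_inputs_on_gpu: a simple balanced split (assumption).
static uint32_t blocks_on_gpu(uint32_t num_inputs, uint32_t i, uint32_t gpus) {
  return num_inputs / gpus + (i < num_inputs % gpus ? 1 : 0);
}

// Host-only model of multi_gpu_lwe_scatter: dest[i] receives GPU i's blocks.
void model_scatter(std::vector<std::vector<uint64_t>> &dest,
                   const uint64_t *src, const uint64_t *h_src_indexes,
                   bool is_trivial_index, uint32_t num_inputs,
                   uint32_t elements_per_input) {
  uint32_t gpus = (uint32_t)dest.size();
  uint32_t gpu_offset = 0;
  for (uint32_t i = 0; i < gpus; i++) {
    uint32_t n = blocks_on_gpu(num_inputs, i, gpus);
    dest[i].resize((size_t)n * elements_per_input);
    if (is_trivial_index) {
      // Fast path: GPU i's blocks are contiguous in src, so one big copy.
      std::memcpy(dest[i].data(),
                  src + (size_t)gpu_offset * elements_per_input,
                  (size_t)n * elements_per_input * sizeof(uint64_t));
    } else {
      // Indexed path: one copy per block, following the host-side table.
      for (uint32_t j = 0; j < n; j++)
        std::memcpy(dest[i].data() + (size_t)j * elements_per_input,
                    src + (size_t)h_src_indexes[gpu_offset + j] *
                              elements_per_input,
                    (size_t)elements_per_input * sizeof(uint64_t));
    }
    gpu_offset += n;
  }
}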
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
index c0b0690692..10a2674d0d 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -366,19 +366,13 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
         mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
         mem_ptr->params.pbs_type, true);
   } else {
-    auto h_lwe_indexes_in = luts_message_carry->h_lwe_indexes_in;
-    auto h_lwe_indexes_out = luts_message_carry->h_lwe_indexes_out;
-    cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lwe_indexes_in,
-                             total_count * sizeof(Torus), streams[0],
-                             gpu_indexes[0]);
-    cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lwe_indexes_out,
-                             total_count * sizeof(Torus), streams[0],
-                             gpu_indexes[0]);
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-    multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
-                                 new_blocks_vec, new_blocks, h_lwe_indexes_in,
-                                 message_count, big_lwe_size, false);
+    multi_gpu_lwe_scatter<Torus>(
+        streams, gpu_indexes, gpu_count, new_blocks_vec, new_blocks,
+        luts_message_carry->h_lwe_indexes_in,
+        luts_message_carry->using_trivial_lwe_indexes, message_count,
+        big_lwe_size, false);
 
     /// Apply KS to go from a big LWE dimension to a small LWE dimension
     /// After this keyswitch execution, we need to synchronize the streams
@@ -394,13 +388,15 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
     /// different configuration
     multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count,
                                 small_lwe_vector, small_lwe_vector_vec,
-                                h_lwe_indexes_in, message_count,
-                                small_lwe_size);
+                                luts_message_carry->h_lwe_indexes_in,
+                                luts_message_carry->using_trivial_lwe_indexes,
+                                message_count, small_lwe_size);
 
-    multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
-                                 small_lwe_vector_vec, small_lwe_vector,
-                                 h_lwe_indexes_in, total_count,
-                                 small_lwe_size, false);
+    multi_gpu_lwe_scatter<Torus>(
+        streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
+        small_lwe_vector, luts_message_carry->h_lwe_indexes_in,
+        luts_message_carry->using_trivial_lwe_indexes, total_count,
+        small_lwe_size, false);
 
     /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
     /// dimension to a big LWE dimension
@@ -415,7 +411,9 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
         mem_ptr->params.pbs_type, false);
 
     multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
-                                lwe_after_pbs_vec, h_lwe_indexes_out,
+                                lwe_after_pbs_vec,
+                                luts_message_carry->h_lwe_indexes_out,
+                                luts_message_carry->using_trivial_lwe_indexes,
                                 total_count, big_lwe_size);
   }
 
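Before the helper itself (next file), a rough sense of the saving in host_integer_sum_ciphertexts_vec_kb: with trivial indexes each scatter/gather issues one device-to-device copy per active GPU rather than one per block. The numbers below are assumptions chosen only to make the comparison concrete, not values taken from the patch: 64 blocks, 4 GPUs, 2049-word big-LWE ciphertexts, 8-byte Torus.

#include <cstdint>
#include <cstdio>

int main() {
  // Assumed sizes, for illustration only.
  const uint64_t num_blocks = 64;     // radix blocks being scattered
  const uint64_t gpus = 4;            // active GPUs
  const uint64_t big_lwe_size = 2049; // Torus words per big-LWE ciphertext
  const uint64_t torus_bytes = 8;     // sizeof(uint64_t)

  // Indexed path (is_trivial_index == false): one copy per block.
  const uint64_t indexed_copies = num_blocks;
  const uint64_t indexed_bytes_each = big_lwe_size * torus_bytes;

  // Trivial path (is_trivial_index == true): one contiguous copy per GPU.
  const uint64_t trivial_copies = gpus;
  const uint64_t trivial_bytes_each =
      (num_blocks / gpus) * big_lwe_size * torus_bytes;

  std::printf("indexed: %llu copies x %llu bytes\n",
              (unsigned long long)indexed_copies,
              (unsigned long long)indexed_bytes_each);
  std::printf("trivial: %llu copies x %llu bytes\n",
              (unsigned long long)trivial_copies,
              (unsigned long long)trivial_bytes_each);
  // Same total bytes either way; the win is 16x fewer async copy launches.
  return 0;
}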
diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
index 45df2ff808..62713c69eb 100644
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
@@ -74,7 +74,8 @@ template <typename Torus>
 void multi_gpu_lwe_scatter(cudaStream_t *streams, uint32_t *gpu_indexes,
                            uint32_t gpu_count, std::vector<Torus *> &dest,
                            Torus *src, Torus *h_src_indexes,
-                           uint32_t num_inputs, uint32_t elements_per_input,
+                           bool is_trivial_index, uint32_t num_inputs,
+                           uint32_t elements_per_input,
                            bool sync_threads = true) {
 
   auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
@@ -90,17 +91,27 @@ void multi_gpu_lwe_scatter(cudaStream_t *streams, uint32_t *gpu_indexes,
     for (uint j = 0; j < i; j++) {
       gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
     }
-    auto src_indexes = h_src_indexes + gpu_offset;
 
-    // TODO Check if we can increase parallelization by adding another omp
-    // clause here
-    for (uint j = 0; j < inputs_on_gpu; j++) {
-      auto d_dest = dest[i] + j * elements_per_input;
-      auto d_src = src + src_indexes[j] * elements_per_input;
-
-      cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
-                                   elements_per_input * sizeof(Torus),
-                                   streams[i], gpu_indexes[i]);
+    if (is_trivial_index) {
+      auto d_dest = dest[i];
+      auto d_src = src + gpu_offset * elements_per_input;
+      cuda_memcpy_async_gpu_to_gpu(
+          d_dest, d_src, inputs_on_gpu * elements_per_input * sizeof(Torus),
+          streams[i], gpu_indexes[i]);
+
+    } else {
+      auto src_indexes = h_src_indexes + gpu_offset;
+
+      // TODO Check if we can increase parallelization by adding another omp
+      // clause here
+      for (uint j = 0; j < inputs_on_gpu; j++) {
+        auto d_dest = dest[i] + j * elements_per_input;
+        auto d_src = src + src_indexes[j] * elements_per_input;
+
+        cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
+                                     elements_per_input * sizeof(Torus),
+                                     streams[i], gpu_indexes[i]);
+      }
     }
   }
 
@@ -116,8 +127,8 @@ template <typename Torus>
 void multi_gpu_lwe_gather(cudaStream_t *streams, uint32_t *gpu_indexes,
                           uint32_t gpu_count, Torus *dest,
                           const std::vector<Torus *> &src,
-                          Torus *h_dest_indexes, uint32_t num_inputs,
-                          uint32_t elements_per_input,
+                          Torus *h_dest_indexes, bool is_trivial_index,
+                          uint32_t num_inputs, uint32_t elements_per_input,
                           bool sync_threads = true) {
 
   auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
@@ -132,17 +143,27 @@ void multi_gpu_lwe_gather(cudaStream_t *streams, uint32_t *gpu_indexes,
     for (uint j = 0; j < i; j++) {
       gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
     }
-    auto dest_indexes = h_dest_indexes + gpu_offset;
-
-    // TODO Check if we can increase parallelization by adding another omp
-    // clause here
-    for (uint j = 0; j < inputs_on_gpu; j++) {
-      auto d_dest = dest + dest_indexes[j] * elements_per_input;
-      auto d_src = src[i] + j * elements_per_input;
 
-      cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
-                                   elements_per_input * sizeof(Torus),
-                                   streams[i], gpu_indexes[i]);
+    if (is_trivial_index) {
+      auto d_dest = dest + gpu_offset * elements_per_input;
+      auto d_src = src[i];
+
+      cuda_memcpy_async_gpu_to_gpu(
+          d_dest, d_src, inputs_on_gpu * elements_per_input * sizeof(Torus),
+          streams[i], gpu_indexes[i]);
+    } else {
+      auto dest_indexes = h_dest_indexes + gpu_offset;
+
+      // TODO Check if we can increase parallelization by adding another omp
+      // clause here
+      for (uint j = 0; j < inputs_on_gpu; j++) {
+        auto d_dest = dest + dest_indexes[j] * elements_per_input;
+        auto d_src = src[i] + j * elements_per_input;
+
+        cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
+                                     elements_per_input * sizeof(Torus),
+                                     streams[i], gpu_indexes[i]);
+      }
     }
   }
 
@@ -150,80 +171,6 @@ void multi_gpu_lwe_gather(cudaStream_t *streams, uint32_t *gpu_indexes,
     for (uint i = 0; i < active_gpu_count; i++)
       cuda_synchronize_stream(streams[i], gpu_indexes[i]);
 }
-/// Load an array residing on one GPU to all active gpus
-/// and split the array among them.
-/// The input and output indexing is always the trivial one
-template <typename Torus>
-void multi_gpu_lwe_trivial_scatter(cudaStream_t *streams, uint32_t *gpu_indexes,
-                                   uint32_t gpu_count,
-                                   std::vector<Torus *> &dest, Torus *src,
-                                   uint32_t num_inputs,
-                                   uint32_t elements_per_input,
-                                   bool sync_threads = true) {
-
-  auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
-
-  if (sync_threads)
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
-  dest.resize(active_gpu_count);
-
-#pragma omp parallel for num_threads(active_gpu_count)
-  for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
-    auto gpu_offset = 0;
-    for (uint j = 0; j < i; j++) {
-      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
-    }
-
-    auto d_dest = dest[i];
-    auto d_src = src + gpu_offset * elements_per_input;
-    cuda_memcpy_async_gpu_to_gpu(
-        d_dest, d_src, inputs_on_gpu * elements_per_input * sizeof(Torus),
-        streams[i], gpu_indexes[i]);
-  }
-
-  if (sync_threads)
-    for (uint i = 0; i < active_gpu_count; i++)
-      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-}
-
-/// Copy data from multiple GPUs back to GPU 0 following the indexing given in
-/// dest_indexes
-/// The input indexing should be the trivial one
-template <typename Torus>
-void multi_gpu_lwe_trivial_gather(cudaStream_t *streams, uint32_t *gpu_indexes,
-                                  uint32_t gpu_count, Torus *dest,
-                                  const std::vector<Torus *> &src,
-                                  uint32_t num_inputs,
-                                  uint32_t elements_per_input,
-                                  bool sync_threads = true) {
-
-  auto active_gpu_count = get_active_gpu_count(num_inputs, gpu_count);
-
-  if (sync_threads)
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
-#pragma omp parallel for num_threads(active_gpu_count)
-  for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
-    auto gpu_offset = 0;
-    for (uint j = 0; j < i; j++) {
-      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
-    }
-
-    auto d_dest = dest + gpu_offset * elements_per_input;
-    auto d_src = src[i];
-
-    cuda_memcpy_async_gpu_to_gpu(
-        d_dest, d_src, inputs_on_gpu * elements_per_input * sizeof(Torus),
-        streams[i], gpu_indexes[i]);
-  }
-
-  if (sync_threads)
-    for (uint i = 0; i < active_gpu_count; i++)
-      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-}
 
 template <typename Torus>
 void multi_gpu_lwe_release(cudaStream_t *streams, uint32_t *gpu_indexes,
                            std::vector<Torus *> &vec,