From 1bfa71f81e583defa261972d32ab6f9a2ae94592 Mon Sep 17 00:00:00 2001
From: Pedro Alves
Date: Wed, 26 Jun 2024 15:05:11 +0000
Subject: [PATCH] refactor(gpu): modify apply lut methods to assume inputs and
 outputs have trivial indexing

---
 .../tfhe-cuda-backend/cuda/include/integer.h  |  2 +-
 .../cuda/src/integer/integer.cuh              | 80 ++++++++-----
 .../cuda/src/integer/multiplication.cuh       | 19 ++---
 .../cuda/src/utils/helper_multi_gpu.cuh       | 20 ++---
 4 files changed, 56 insertions(+), 65 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
index 6d5c1920cb..34ee0440a2 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -498,7 +498,7 @@ template <typename Torus> struct int_radix_lut {
       cudaSetDevice(i);
       int8_t *gpu_pbs_buffer;
       auto num_blocks_on_gpu =
-          get_num_inputs_on_gpu(num_radix_blocks, i, gpu_count);
+          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
 
       execute_scratch_pbs<Torus>(
           streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index ccf141c0e7..03f7d0c706 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -155,16 +155,15 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
   auto polynomial_size = params.polynomial_size;
   auto grouping_factor = params.grouping_factor;
 
-  /// Make sure all data that should be on GPU 0 is indeed there
-  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
   /// For multi GPU execution we create vectors of pointers for inputs and
   /// outputs
   std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
   std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
   std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
   std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
-  if (gpu_count == 1) {
+
+  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+  if (active_gpu_count == 1) {
     execute_keyswitch<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
                              lwe_trivial_indexes_vec[0], lwe_array_in,
                              lut->lwe_indexes_in, ksks, big_lwe_dimension,
@@ -181,21 +180,14 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
         grouping_factor, num_radix_blocks, 1, 0,
         cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
   } else {
-    auto h_lwe_indexes_in = lut->h_lwe_indexes_in;
-    auto h_lwe_indexes_out = lut->h_lwe_indexes_out;
-    cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lut->lwe_indexes_in,
-                             num_radix_blocks * sizeof(Torus), streams[0],
-                             gpu_indexes[0]);
-    cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lut->lwe_indexes_out,
-                             num_radix_blocks * sizeof(Torus), streams[0],
-                             gpu_indexes[0]);
+    /// Make sure all data that should be on GPU 0 is indeed there
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
 
     /// With multiple GPUs we push to the vectors on each GPU then when we
     /// gather data to GPU 0 we can copy back to the original indexing
-    multi_gpu_lwe_scatter<Torus>(
+    multi_gpu_lwe_trivial_scatter<Torus>(
         streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_in,
-        h_lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1, false);
+        num_radix_blocks, big_lwe_dimension + 1, false);
 
     /// Apply KS to go from a big LWE dimension to a small LWE dimension
     execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
@@ -215,15 +207,14 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
         cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
 
     /// Copy data back to GPU 0 and release vecs
-    multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
-                                lwe_after_pbs_vec, h_lwe_indexes_out,
-                                num_radix_blocks, big_lwe_dimension + 1, false);
-  }
+    multi_gpu_lwe_trivial_gather<Torus>(
+        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_after_pbs_vec,
+        num_radix_blocks, big_lwe_dimension + 1, false);
 
-  /// Synchronize all GPUs
-  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
-  for (uint i = 0; i < active_gpu_count; i++) {
-    cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    /// Synchronize all GPUs
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    }
   }
 }
 
@@ -254,15 +245,15 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
       num_radix_blocks);
   check_cuda_error(cudaGetLastError());
 
-  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
   /// For multi GPU execution we create vectors of pointers for inputs and
   /// outputs
   std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
   std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
   std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
   std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
-  if (gpu_count == 1) {
+
+  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+  if (active_gpu_count == 1) {
     execute_keyswitch<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
                              lwe_trivial_indexes_vec[0], lwe_array_pbs_in,
                              lut->lwe_indexes_in, ksks, big_lwe_dimension,
@@ -279,19 +270,14 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
         grouping_factor, num_radix_blocks, 1, 0,
         cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
   } else {
-    auto h_lwe_indexes_in = lut->h_lwe_indexes_in;
-    auto h_lwe_indexes_out = lut->h_lwe_indexes_out;
-    cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lut->lwe_indexes_in,
-                             num_radix_blocks * sizeof(Torus), streams[0],
-                             gpu_indexes[0]);
-    cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lut->lwe_indexes_out,
-                             num_radix_blocks * sizeof(Torus), streams[0],
-                             gpu_indexes[0]);
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
-    multi_gpu_lwe_scatter<Torus>(
+    // multi_gpu_lwe_scatter<Torus>(
+    //     streams, gpu_indexes, gpu_count, lwe_array_in_vec,
+    //     lwe_array_pbs_in, h_lwe_indexes_in, num_radix_blocks,
+    //     big_lwe_dimension + 1, false);
+    multi_gpu_lwe_trivial_scatter<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
-        h_lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1, false);
+        num_radix_blocks, big_lwe_dimension + 1, false);
 
     /// Apply KS to go from a big LWE dimension to a small LWE dimension
     execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
@@ -311,15 +297,19 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
         cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
 
     /// Copy data back to GPU 0 and release vecs
-    multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
-                                lwe_after_pbs_vec, h_lwe_indexes_out,
-                                num_radix_blocks, big_lwe_dimension + 1, false);
-  }
-
-  /// Synchronize all GPUs
-  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
-  for (uint i = 0; i < active_gpu_count; i++) {
-    cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    // multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count,
+    //                             lwe_array_out,
+    //                             lwe_after_pbs_vec, h_lwe_indexes_out,
+    //                             num_radix_blocks, big_lwe_dimension + 1,
+    //                             false);
+    multi_gpu_lwe_trivial_gather<Torus>(
+        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_after_pbs_vec,
+        num_radix_blocks, big_lwe_dimension + 1, false);
+
+    /// Synchronize all GPUs
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    }
   }
 }
 
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
index fe6c6a007e..b2a1c79dff 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -331,8 +331,6 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
 
   luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
 
-  auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
-
   /// For multi GPU execution we create vectors of pointers for inputs and
   /// outputs
   std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
@@ -343,21 +341,24 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
   std::vector<Torus *> lwe_trivial_indexes_vec =
       luts_message_carry->lwe_trivial_indexes_vec;
 
-  if (gpu_count == 1) {
+  // TODO We should be able to run keyswitch on a single GPU and PBS on
+  // multiple GPUs if needed
+  auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
+  if (active_gpu_count == 1) {
     /// Apply KS to go from a big LWE dimension to a small LWE dimension
     /// After this keyswitch execution, we need to synchronize the streams
     /// because the keyswitch and PBS do not operate on the same number of
     /// inputs
-    execute_keyswitch<Torus>(
-        streams, gpu_indexes, gpu_count, small_lwe_vector, lwe_indexes_in,
-        new_blocks, lwe_indexes_in, ksks, polynomial_size * glwe_dimension,
-        lwe_dimension, mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
-        message_count, true);
+    execute_keyswitch<Torus>(streams, gpu_indexes, 1, small_lwe_vector,
+                             lwe_indexes_in, new_blocks, lwe_indexes_in, ksks,
+                             polynomial_size * glwe_dimension, lwe_dimension,
+                             mem_ptr->params.ks_base_log,
+                             mem_ptr->params.ks_level, message_count, true);
 
     /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
     /// dimension to a big LWE dimension
     execute_pbs<Torus>(
-        streams, gpu_indexes, gpu_count, new_blocks, lwe_indexes_out,
+        streams, gpu_indexes, 1, new_blocks, lwe_indexes_out,
         luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
         small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer,
         glwe_dimension, lwe_dimension, polynomial_size,
diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
index 937f4656a1..45df2ff808 100644
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
@@ -55,7 +55,7 @@ void multi_gpu_lwe_init(cudaStream_t *streams, uint32_t *gpu_indexes,
   dest.resize(active_gpu_count);
 #pragma omp parallel for num_threads(active_gpu_count)
   for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
     Torus *d_array = (Torus *)cuda_malloc_async(
         inputs_on_gpu * elements_per_input * sizeof(Torus), streams[i],
         gpu_indexes[i]);
@@ -83,12 +83,12 @@ void multi_gpu_lwe_scatter(cudaStream_t *streams, uint32_t *gpu_indexes,
 
   cuda_synchronize_stream(streams[0], gpu_indexes[0]);
   dest.resize(active_gpu_count);
-  // #pragma omp parallel for num_threads(active_gpu_count)
+#pragma omp parallel for num_threads(active_gpu_count)
   for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
     auto gpu_offset = 0;
     for (uint j = 0; j < i; j++) {
-      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
+      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
     }
 
     auto src_indexes = h_src_indexes + gpu_offset;
@@ -127,10 +127,10 @@ void multi_gpu_lwe_gather(cudaStream_t *streams, uint32_t *gpu_indexes,
 
 #pragma omp parallel for num_threads(active_gpu_count)
   for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
     auto gpu_offset = 0;
     for (uint j = 0; j < i; j++) {
-      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
+      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
     }
 
     auto dest_indexes = h_dest_indexes + gpu_offset;
@@ -170,10 +170,10 @@ void multi_gpu_lwe_trivial_scatter(cudaStream_t *streams, uint32_t *gpu_indexes,
 
 #pragma omp parallel for num_threads(active_gpu_count)
   for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
     auto gpu_offset = 0;
     for (uint j = 0; j < i; j++) {
-      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
+      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
     }
 
     auto d_dest = dest[i];
@@ -206,10 +206,10 @@ void multi_gpu_lwe_trivial_gather(cudaStream_t *streams, uint32_t *gpu_indexes,
 
 #pragma omp parallel for num_threads(active_gpu_count)
   for (uint i = 0; i < active_gpu_count; i++) {
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
     auto gpu_offset = 0;
     for (uint j = 0; j < i; j++) {
-      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
+      gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
     }
 
     auto d_dest = dest + gpu_offset * elements_per_input;
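
Note (reviewer sketch, not part of the patch): with trivial indexing, block i of a radix
ciphertext is assumed to sit at position i of the LWE array, so the multi-GPU path can hand
each active GPU a contiguous slice of the inputs and gather the results back in the same
order, instead of routing every block through the h_lwe_indexes_in / h_lwe_indexes_out
lookup tables. The standalone C++ sketch below only illustrates that offset arithmetic, i.e.
the gpu_offset loops used by multi_gpu_lwe_trivial_scatter/gather above; num_inputs_on_gpu
is a hypothetical stand-in for the backend's get_num_inputs_on_gpu, and the balanced split
it implements is an assumption, not necessarily the library's exact policy.

// Hypothetical illustration of the contiguous, trivially indexed split across
// active GPUs. Build with a plain C++ compiler, e.g. g++ -std=c++17 split.cpp
#include <cstdint>
#include <cstdio>

// Assumed balanced split: the first (num_inputs % gpu_count) GPUs receive one
// extra input. Stand-in for get_num_inputs_on_gpu; the real policy may differ.
static uint32_t num_inputs_on_gpu(uint32_t num_inputs, uint32_t gpu_index,
                                  uint32_t gpu_count) {
  uint32_t base = num_inputs / gpu_count;
  uint32_t extra = num_inputs % gpu_count;
  return base + (gpu_index < extra ? 1 : 0);
}

int main() {
  const uint32_t num_radix_blocks = 10; // e.g. blocks of one radix ciphertext
  const uint32_t active_gpu_count = 3;  // GPUs that actually receive work

  uint32_t gpu_offset = 0;
  for (uint32_t i = 0; i < active_gpu_count; i++) {
    uint32_t inputs_on_gpu =
        num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
    // With trivial indexing GPU i owns the contiguous block range
    // [gpu_offset, gpu_offset + inputs_on_gpu); no per-block index tables
    // (h_lwe_indexes_in / h_lwe_indexes_out) are consulted.
    std::printf("GPU %u: blocks [%u, %u)\n", i, gpu_offset,
                gpu_offset + inputs_on_gpu);
    gpu_offset += inputs_on_gpu;
  }
  return 0;
}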