refactor(gpu): implement a bypass to avoid wasting time with multi-gpu scatter/gather logic when running in a single-gpu environment
pdroalves authored and agnesLeroy committed Jun 27, 2024
1 parent 3276661 commit cf72e95
Showing 3 changed files with 179 additions and 125 deletions.
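At its core, the commit wraps the existing multi-GPU scatter/gather path in an else branch and adds a direct single-GPU path that keyswitches and bootstraps in place on GPU 0, with no host-side index copies and no scatter/gather round trips. The sketch below illustrates that shape only; it is not part of the commit, and the helper names are hypothetical stand-ins for the backend's execute_keyswitch, execute_pbs, multi_gpu_lwe_scatter and multi_gpu_lwe_gather calls.

#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins for the CUDA backend helpers.
static void keyswitch_then_pbs_on_gpu0() { std::puts("KS + PBS in place on GPU 0"); }
static void copy_indexes_to_host()       { std::puts("copy LWE indexes to the host"); }
static void scatter_blocks_to_gpus()     { std::puts("scatter radix blocks across GPUs"); }
static void keyswitch_then_pbs_per_gpu() { std::puts("KS + PBS on each GPU's shard"); }
static void gather_blocks_on_gpu0()      { std::puts("gather results back on GPU 0"); }

// Shape of the bypass: only pay for the multi-GPU machinery when it is needed.
void apply_lut(std::uint32_t gpu_count) {
  if (gpu_count == 1) {
    keyswitch_then_pbs_on_gpu0();
  } else {
    copy_indexes_to_host();
    scatter_blocks_to_gpus();
    keyswitch_then_pbs_per_gpu();
    gather_blocks_on_gpu0();
  }
}

int main() {
  apply_lut(1); // single-GPU: direct path
  apply_lut(2); // multi-GPU: scatter/gather path
}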
5 changes: 0 additions & 5 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -472,7 +472,6 @@ template <typename Torus> struct int_radix_lut {
// lwe_indexes_in != lwe_indexes_out
Torus *lwe_trivial_indexes;
Torus *tmp_lwe_before_ks;
Torus *tmp_lwe_after_ks;

/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
@@ -583,8 +582,6 @@ template <typename Torus> struct int_radix_lut {
(params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
tmp_lwe_before_ks =
(Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
tmp_lwe_after_ks =
(Torus *)cuda_malloc_async(small_size, streams[0], gpu_indexes[0]);
}
}

@@ -606,7 +603,6 @@ template <typename Torus> struct int_radix_lut {
buffer = base_lut_object->buffer;
// Keyswitch
tmp_lwe_before_ks = base_lut_object->tmp_lwe_before_ks;
tmp_lwe_after_ks = base_lut_object->tmp_lwe_after_ks;

/// With multiple GPUs we allocate arrays to be pushed to the vectors and
/// copy data on each GPU then when we gather data to GPU 0 we can copy back
@@ -729,7 +725,6 @@ template <typename Torus> struct int_radix_lut {

if (!mem_reuse) {
cuda_drop_async(tmp_lwe_before_ks, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_lwe_after_ks, streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (int i = 0; i < buffer.size(); i++) {
switch (params.pbs_type) {
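Note (editorial, not part of the diff): tmp_lwe_after_ks can be dropped here presumably because the new single-GPU path in integer.cuh writes its keyswitch output into lwe_after_ks_vec[0], the first of the per-GPU buffers the structure already holds, so a separate single-GPU temporary is no longer needed.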
176 changes: 105 additions & 71 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -164,44 +164,61 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
if (gpu_count == 1) {
execute_keyswitch<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], lwe_array_in,
lut->lwe_indexes_in, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks, false);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
} else {
auto h_lwe_indexes_in = lut->h_lwe_indexes_in;
auto h_lwe_indexes_out = lut->h_lwe_indexes_out;
cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lut->lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lut->lwe_indexes_out,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

auto h_lwe_indexes_in = lut->h_lwe_indexes_in;
auto h_lwe_indexes_out = lut->h_lwe_indexes_out;
cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lut->lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lut->lwe_indexes_out,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

/// With multiple GPUs we push to the vectors on each GPU then when we gather
/// data to GPU 0 we can copy back to the original indexing
multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
lwe_array_in_vec, lwe_array_in, h_lwe_indexes_in,
num_radix_blocks, big_lwe_dimension + 1, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
lwe_trivial_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks, false);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_after_pbs_vec, h_lwe_indexes_out,
num_radix_blocks, big_lwe_dimension + 1, false);
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_in,
h_lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
lwe_trivial_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks, false);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_after_pbs_vec, h_lwe_indexes_out,
num_radix_blocks, big_lwe_dimension + 1, false);
}

/// Synchronize all GPUs
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
@@ -245,42 +262,59 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
if (gpu_count == 1) {
execute_keyswitch<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], lwe_array_pbs_in,
lut->lwe_indexes_in, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks, false);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
} else {
auto h_lwe_indexes_in = lut->h_lwe_indexes_in;
auto h_lwe_indexes_out = lut->h_lwe_indexes_out;
cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lut->lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lut->lwe_indexes_out,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

auto h_lwe_indexes_in = lut->h_lwe_indexes_in;
auto h_lwe_indexes_out = lut->h_lwe_indexes_out;
cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lut->lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lut->lwe_indexes_out,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
h_lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1, false);

multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
h_lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
lwe_trivial_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks, false);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_after_pbs_vec, h_lwe_indexes_out,
num_radix_blocks, big_lwe_dimension + 1, false);
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
lwe_trivial_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks, false);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_after_pbs_vec, h_lwe_indexes_out,
num_radix_blocks, big_lwe_dimension + 1, false);
}

/// Synchronize all GPUs
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
123 changes: 74 additions & 49 deletions backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -343,55 +343,80 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
std::vector<Torus *> lwe_trivial_indexes_vec =
luts_message_carry->lwe_trivial_indexes_vec;

auto h_lwe_indexes_in = luts_message_carry->h_lwe_indexes_in;
auto h_lwe_indexes_out = luts_message_carry->h_lwe_indexes_out;
cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lwe_indexes_in,
total_count * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lwe_indexes_out,
total_count * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
new_blocks_vec, new_blocks, h_lwe_indexes_in,
message_count, big_lwe_size, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec, ksks,
big_lwe_dimension, lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, message_count, false);

/// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
/// different configuration
multi_gpu_lwe_gather<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector, small_lwe_vector_vec,
h_lwe_indexes_in, message_count, small_lwe_size);

multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector_vec, small_lwe_vector,
h_lwe_indexes_in, total_count, small_lwe_size, false);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec,
small_lwe_vector_vec, lwe_trivial_indexes_vec, bsks,
luts_message_carry->buffer, glwe_dimension,
lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count, 2, 0,
max_shared_memory, mem_ptr->params.pbs_type, false);

multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
lwe_after_pbs_vec, h_lwe_indexes_out,
total_count, big_lwe_size);
if (gpu_count == 1) {
/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector, lwe_indexes_in,
new_blocks, lwe_indexes_in, ksks, polynomial_size * glwe_dimension,
lwe_dimension, mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
message_count, true);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, gpu_count, new_blocks, lwe_indexes_out,
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer,
glwe_dimension, lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
mem_ptr->params.pbs_type, true);
} else {
auto h_lwe_indexes_in = luts_message_carry->h_lwe_indexes_in;
auto h_lwe_indexes_out = luts_message_carry->h_lwe_indexes_out;
cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lwe_indexes_in,
total_count * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lwe_indexes_out,
total_count * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
new_blocks_vec, new_blocks, h_lwe_indexes_in,
message_count, big_lwe_size, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
ksks, big_lwe_dimension, lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, message_count, false);

/// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
/// different configuration
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count,
small_lwe_vector, small_lwe_vector_vec,
h_lwe_indexes_in, message_count,
small_lwe_size);

multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
small_lwe_vector_vec, small_lwe_vector,
h_lwe_indexes_in, total_count,
small_lwe_size, false);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
lwe_trivial_indexes_vec, bsks, luts_message_carry->buffer,
glwe_dimension, lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
mem_ptr->params.pbs_type, false);

multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
lwe_after_pbs_vec, h_lwe_indexes_out,
total_count, big_lwe_size);
}

luts_message_carry->release(streams, gpu_indexes, gpu_count);
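Note (editorial, not part of the diff): in host_integer_sum_ciphertexts_vec_kb the single-GPU branch runs the keyswitch on message_count blocks and the PBS on total_count blocks directly through the LUT's own lwe_indexes_in/lwe_indexes_out, passing true as the final argument (presumably the synchronization the preceding comment calls for, since the two calls operate on different numbers of inputs), while the multi-GPU branch keeps the scatter, gather and re-scatter sequence through lwe_trivial_indexes_vec with that flag set to false.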

