refactor(gpu): add a parameter to enable optimizations in case lwe_indexes_(in/out) is trivial
pdroalves authored and agnesLeroy committed Jun 27, 2024
1 parent 1bfa71f commit 72ad8e6
Showing 4 changed files with 151 additions and 184 deletions.
117 changes: 73 additions & 44 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -468,6 +468,8 @@ template <typename Torus> struct int_radix_lut {
Torus *lwe_indexes_out;
Torus *h_lwe_indexes_in;
Torus *h_lwe_indexes_out;
// Enable optimizations if lwe_indexes_(in/out) are trivial
bool using_trivial_lwe_indexes = true;
// lwe_trivial_indexes is the intermediary index we need in case
// lwe_indexes_in != lwe_indexes_out
Torus *lwe_trivial_indexes;
@@ -542,22 +544,20 @@ template <typename Torus> struct int_radix_lut {
h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));

auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus));

for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes[i] = i;
h_lwe_indexes_in[i] = i;

cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes);
memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus));

/// With multiple GPUs we allocate arrays to be pushed to the vectors and
/// copy data on each GPU then when we gather data to GPU 0 we can copy
@@ -646,22 +646,20 @@ template <typename Torus> struct int_radix_lut {
h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));

auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus));

for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes[i] = i;
h_lwe_indexes_in[i] = i;

cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes);
memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus));
}

// Return a pointer to idx-ith lut at gpu_index's global memory
@@ -679,6 +677,22 @@ template <typename Torus> struct int_radix_lut {
return &lut_indexes[ind];
}

// If this function is called, we assume lwe_indexes_(in/out) are no longer
// trivial and disable the related optimizations
void set_lwe_indexes(cudaStream_t stream, uint32_t gpu_index,
Torus *h_indexes_in, Torus *h_indexes_out) {

memcpy(h_lwe_indexes_in, h_indexes_in, num_blocks * sizeof(Torus));
memcpy(h_lwe_indexes_out, h_indexes_out, num_blocks * sizeof(Torus));

cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
num_blocks * sizeof(Torus), stream, gpu_index);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_out,
num_blocks * sizeof(Torus), stream, gpu_index);

using_trivial_lwe_indexes = false;
}

// Broadcast luts from gpu src_gpu_idx to all active gpus
void broadcast_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t src_gpu_idx) {
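The new set_lwe_indexes entry point above is the only way callers register non-trivial index arrays, and calling it is what flips using_trivial_lwe_indexes off. Below is a minimal sketch of a caller, assuming integer.h (above) and <vector> are available; the helper name and the permutation argument are illustrative, only int_radix_lut and set_lwe_indexes come from this change.

#include <vector>

// Hypothetical helper: route a LUT application through a caller-chosen output
// permutation. perm must hold one entry per radix block of the LUT.
template <typename Torus>
void use_permuted_indexes(int_radix_lut<Torus> *lut, cudaStream_t stream,
                          uint32_t gpu_index, const std::vector<Torus> &perm) {
  std::vector<Torus> h_in(perm.size()), h_out(perm.size());
  for (size_t i = 0; i < perm.size(); i++) {
    h_in[i] = static_cast<Torus>(i); // read block i in natural order
    h_out[i] = perm[i];              // write it to the permuted slot
  }
  // Copies both arrays to the GPU and clears using_trivial_lwe_indexes, so
  // later scatter/gather calls take the index-aware path.
  lut->set_lwe_indexes(stream, gpu_index, h_in.data(), h_out.data());
}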
@@ -810,12 +824,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
for (int i = 0; i < bits_per_block; i++)
h_lwe_indexes_in[i + j * bits_per_block] = j;
}
cuda_memcpy_async_to_gpu(lut->lwe_indexes_in, h_lwe_indexes_in,
num_radix_blocks * bits_per_block *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes_in);

/**
* the output should target different lwe ciphertexts, so lwe_indexes_out =
@@ -827,10 +835,11 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
for (int i = 0; i < num_radix_blocks * bits_per_block; i++)
h_lwe_indexes_out[i] = i;

cuda_memcpy_async_to_gpu(lut->lwe_indexes_out, h_lwe_indexes_out,
num_radix_blocks * bits_per_block *
sizeof(Torus),
streams[0], gpu_indexes[0]);
lut->set_lwe_indexes(streams[0], gpu_indexes[0], h_lwe_indexes_in,
h_lwe_indexes_out);

cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes_in);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes_out);
}
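For the bit-extract buffer, the two host arrays built above encode "reuse each input block once per extracted bit, give every extracted bit its own output slot". A small worked example, with num_radix_blocks = 2 and bits_per_block = 4 chosen purely for illustration:

// Example sizes only: num_radix_blocks = 2, bits_per_block = 4.
uint64_t h_in[8], h_out[8];
for (int j = 0; j < 2; j++)
  for (int i = 0; i < 4; i++)
    h_in[i + j * 4] = j; // h_in  = {0,0,0,0,1,1,1,1}: input block j read once per bit
for (int i = 0; i < 8; i++)
  h_out[i] = i;          // h_out = {0,1,2,3,4,5,6,7}: a distinct output per bit
// Registering the arrays through lut->set_lwe_indexes(...) also marks the
// indexes as non-trivial, which is why the direct cuda_memcpy_async_to_gpu
// calls could be dropped from this constructor.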
@@ -1747,19 +1756,23 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {

cudaStream_t *local_streams_1;
cudaStream_t *local_streams_2;
uint32_t active_gpu_count;

int_arithmetic_scalar_shift_buffer(cudaStream_t *streams,
uint32_t *gpu_indexes, uint32_t gpu_count,
SHIFT_OR_ROTATE_TYPE shift_type,
int_radix_params params,
uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
// In the arithmetic shift, a PBS has to be applied to the last rotated
// block twice: once to shift it, once to compute the padding block to be
// copied onto all blocks to the left of the last rotated block
local_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
local_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < gpu_count; j++) {
local_streams_1 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
local_streams_2 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < active_gpu_count; j++) {
local_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
local_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
}
@@ -1892,7 +1905,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {

void release(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count) {
for (uint j = 0; j < gpu_count; j++) {
for (uint j = 0; j < active_gpu_count; j++) {
cuda_destroy_stream(local_streams_1[j], gpu_indexes[j]);
cuda_destroy_stream(local_streams_2[j], gpu_indexes[j]);
}
@@ -1921,20 +1934,24 @@ template <typename Torus> struct int_zero_out_if_buffer {

cudaStream_t *true_streams;
cudaStream_t *false_streams;
uint32_t active_gpu_count;

int_zero_out_if_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
this->params = params;
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);

Torus big_size =
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
if (allocate_gpu_memory) {
tmp = (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
// We may use a different stream to allow concurrent operation
true_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
false_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < gpu_count; j++) {
true_streams =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
false_streams =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < active_gpu_count; j++) {
true_streams[j] = cuda_create_stream(gpu_indexes[j]);
false_streams[j] = cuda_create_stream(gpu_indexes[j]);
}
@@ -1943,7 +1960,7 @@ template <typename Torus> struct int_zero_out_if_buffer {
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(tmp, streams[0], gpu_indexes[0]);
for (uint j = 0; j < gpu_count; j++) {
for (uint j = 0; j < active_gpu_count; j++) {
cuda_destroy_stream(true_streams[j], gpu_indexes[j]);
cuda_destroy_stream(false_streams[j], gpu_indexes[j]);
}
@@ -2377,6 +2394,7 @@ template <typename Torus> struct int_comparison_buffer {
int_radix_lut<Torus> *signed_msb_lut;
cudaStream_t *lsb_streams;
cudaStream_t *msb_streams;
uint32_t active_gpu_count;

int_comparison_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, COMPARISON_TYPE op,
@@ -2386,14 +2404,18 @@
this->op = op;
this->is_signed = is_signed;

active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);

identity_lut_f = [](Torus x) -> Torus { return x; };

auto big_lwe_size = params.big_lwe_dimension + 1;

if (allocate_gpu_memory) {
lsb_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
msb_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < gpu_count; j++) {
lsb_streams =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
msb_streams =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < active_gpu_count; j++) {
lsb_streams[j] = cuda_create_stream(gpu_indexes[j]);
msb_streams[j] = cuda_create_stream(gpu_indexes[j]);
}
@@ -2557,7 +2579,7 @@ template <typename Torus> struct int_comparison_buffer {
signed_msb_lut->release(streams, gpu_indexes, gpu_count);
delete (signed_msb_lut);
}
for (uint j = 0; j < gpu_count; j++) {
for (uint j = 0; j < active_gpu_count; j++) {
cuda_destroy_stream(lsb_streams[j], gpu_indexes[j]);
cuda_destroy_stream(msb_streams[j], gpu_indexes[j]);
}
@@ -2568,6 +2590,7 @@

template <typename Torus> struct int_div_rem_memory {
int_radix_params params;
uint32_t active_gpu_count;

// memory objects for other operations
int_logical_scalar_shift_buffer<Torus> *shift_mem_1;
@@ -2803,6 +2826,8 @@ template <typename Torus> struct int_div_rem_memory {
int_div_rem_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_blocks, bool allocate_gpu_memory) {
active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);

this->params = params;
shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
@@ -2822,11 +2847,15 @@ template <typename Torus> struct int_div_rem_memory {
init_lookup_tables(streams, gpu_indexes, gpu_count, num_blocks);
init_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks);

sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
sub_streams_3 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
sub_streams_4 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < gpu_count; j++) {
sub_streams_1 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_2 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_3 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_4 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < active_gpu_count; j++) {
sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_3[j] = cuda_create_stream(gpu_indexes[j]);
@@ -2897,7 +2926,7 @@ template <typename Torus> struct int_div_rem_memory {
delete[] merge_overflow_flags_luts;

// release sub streams
for (uint i = 0; i < gpu_count; i++) {
for (uint i = 0; i < active_gpu_count; i++) {
cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
cuda_destroy_stream(sub_streams_3[i], gpu_indexes[i]);
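A recurring change in the buffers above is sizing the per-operation stream arrays with get_active_gpu_count instead of the raw gpu_count, so no streams are created for GPUs that will never receive a radix block. The sketch below illustrates the idea; the body of the count function is a guess for illustration, not the backend's actual implementation, and only cuda_create_stream and the allocation pattern come from the diff above.

#include <algorithm>
#include <cstdlib>

// Assumed contract of get_active_gpu_count: never more GPUs than blocks.
static uint32_t active_gpu_count_sketch(uint32_t num_radix_blocks,
                                        uint32_t gpu_count) {
  return std::min(num_radix_blocks, gpu_count); // hypothetical body
}

// Allocation pattern used by the structs above: create exactly
// active_gpu_count streams, and destroy exactly that many on release.
static cudaStream_t *alloc_local_streams(uint32_t active_gpu_count,
                                         uint32_t *gpu_indexes) {
  auto *streams =
      (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
  for (uint32_t j = 0; j < active_gpu_count; j++)
    streams[j] = cuda_create_stream(gpu_indexes[j]);
  return streams;
}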
33 changes: 14 additions & 19 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -185,9 +185,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(

/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_lwe_trivial_scatter<Torus>(
multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_in,
num_radix_blocks, big_lwe_dimension + 1, false);
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
big_lwe_dimension + 1, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
Expand All @@ -207,9 +208,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
multi_gpu_lwe_trivial_gather<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_after_pbs_vec,
num_radix_blocks, big_lwe_dimension + 1, false);
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1, false);

/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
@@ -271,13 +273,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
// multi_gpu_lwe_scatter<Torus>(
// streams, gpu_indexes, gpu_count, lwe_array_in_vec,
// lwe_array_pbs_in, h_lwe_indexes_in, num_radix_blocks,
// big_lwe_dimension + 1, false);
multi_gpu_lwe_trivial_scatter<Torus>(
multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
num_radix_blocks, big_lwe_dimension + 1, false);
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
big_lwe_dimension + 1, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
@@ -297,14 +296,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
// multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count,
// lwe_array_out,
// lwe_after_pbs_vec, h_lwe_indexes_out,
// num_radix_blocks, big_lwe_dimension + 1,
// false);
multi_gpu_lwe_trivial_gather<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_after_pbs_vec,
num_radix_blocks, big_lwe_dimension + 1, false);
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1, false);

/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
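With the trivial gather/scatter variants replaced by index-aware ones, the using_trivial_lwe_indexes flag is what lets the helpers keep the fast contiguous copy when the mapping is still the identity. A host-side illustration of that branch follows; the names and signature are assumptions, and the real multi_gpu_lwe_scatter/gather distribute data across GPUs rather than copying in host memory.

#include <cstdint>
#include <cstring>

// Illustration only: identity indexes allow one bulk copy, arbitrary indexes
// force a per-block indirection through the host-side index array.
template <typename Torus>
void scatter_sketch(Torus *dst, const Torus *src, const Torus *h_indexes_in,
                    bool using_trivial_lwe_indexes, uint32_t num_blocks,
                    uint32_t lwe_size) {
  if (using_trivial_lwe_indexes) {
    // Fast path: the source is already laid out in block order.
    std::memcpy(dst, src, (size_t)num_blocks * lwe_size * sizeof(Torus));
  } else {
    // General path: pick each block through its index.
    for (uint32_t i = 0; i < num_blocks; i++)
      std::memcpy(dst + (size_t)i * lwe_size,
                  src + (size_t)h_indexes_in[i] * lwe_size,
                  lwe_size * sizeof(Torus));
  }
}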