
refactor(gpu): add a parameter to enable optimizations in case lwe_indexes_(in/out) is trivial
pdroalves committed Jun 26, 2024
1 parent 995c3e1 commit 5eaf4fe
Showing 4 changed files with 109 additions and 160 deletions.
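
The commit adds a `using_trivial_lwe_indexes` flag to `int_radix_lut` so the multi-GPU scatter/gather helpers can tell when the input/output index arrays are still the identity mapping 0, 1, ..., n-1 and skip per-ciphertext indirection. As a rough illustration of the fast path such a flag enables, here is a host-side sketch; it is not code from the repository, and `indexes_are_trivial` and `gather_lwe` are hypothetical names.

```cpp
// Illustrative host-side sketch, not taken from the repository.
// `indexes_are_trivial` and `gather_lwe` are hypothetical helpers.
#include <cstdint>
#include <cstring>
#include <vector>

// True when `indexes` is the identity permutation 0..n-1.
bool indexes_are_trivial(const std::vector<uint64_t> &indexes) {
  for (uint64_t i = 0; i < indexes.size(); i++)
    if (indexes[i] != i)
      return false;
  return true;
}

// Gathers indexes.size() LWE ciphertexts of `lwe_size` words each from `src`
// into `dst`; with trivial indexes this collapses into one contiguous copy.
void gather_lwe(uint64_t *dst, const uint64_t *src,
                const std::vector<uint64_t> &indexes, size_t lwe_size) {
  if (indexes_are_trivial(indexes)) {
    std::memcpy(dst, src, indexes.size() * lwe_size * sizeof(uint64_t));
  } else {
    for (size_t i = 0; i < indexes.size(); i++)
      std::memcpy(dst + i * lwe_size, src + indexes[i] * lwe_size,
                  lwe_size * sizeof(uint64_t));
  }
}
```

Caching the answer in a flag, as the diff does, avoids even this O(n) triviality check: the indexes only stop being trivial when `set_lwe_indexes` is called.
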
61 changes: 35 additions & 26 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -463,6 +463,8 @@ template <typename Torus> struct int_radix_lut {
Torus *lwe_indexes_out;
Torus *h_lwe_indexes_in;
Torus *h_lwe_indexes_out;
// Enable optimizations if lwe_indexes_(in/out) are trivial
bool using_trivial_lwe_indexes = true;
// lwe_trivial_indexes is the intermediary index we need in case
// lwe_indexes_in != lwe_indexes_out
Torus *lwe_trivial_indexes;
@@ -537,22 +539,20 @@ template <typename Torus> struct int_radix_lut {
h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));

auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus));

for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes[i] = i;
h_lwe_indexes_in[i] = i;

cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes);
memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus));

/// With multiple GPUs we allocate arrays to be pushed to the vectors and
/// copy data on each GPU then when we gather data to GPU 0 we can copy
@@ -641,22 +641,20 @@ template <typename Torus> struct int_radix_lut {
h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));

auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus));

for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes[i] = i;
h_lwe_indexes_in[i] = i;

cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes);
memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus));
}

// Return a pointer to idx-ith lut at gpu_index's global memory
@@ -674,6 +672,22 @@ template <typename Torus> struct int_radix_lut {
return &lut_indexes[ind];
}

// If this function is called, we assume the lwe_indexes_(in/out) are no
// longer trivial and thus we disable optimizations
void set_lwe_indexes(cudaStream_t stream, uint32_t gpu_index,
Torus *h_indexes_in, Torus *h_indexes_out) {

memcpy(h_lwe_indexes_in, h_indexes_in, num_blocks * sizeof(Torus));
memcpy(h_lwe_indexes_out, h_indexes_out, num_blocks * sizeof(Torus));

cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
num_blocks * sizeof(Torus), stream, gpu_index);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_out,
num_blocks * sizeof(Torus), stream, gpu_index);

using_trivial_lwe_indexes = false;
}

// Broadcast luts from gpu src_gpu_idx to all active gpus
void broadcast_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t src_gpu_idx) {
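
The new `set_lwe_indexes` method above is the single place where custom mappings enter the structure, which is why it can safely clear the flag. A self-contained sketch of the same pattern (illustrative only; `IndexedLut` is a hypothetical stand-in for `int_radix_lut`, without the device copies):

```cpp
// Illustrative only; IndexedLut is a hypothetical stand-in for int_radix_lut.
#include <cstdint>
#include <cstring>
#include <vector>

struct IndexedLut {
  std::vector<uint64_t> h_lwe_indexes_in, h_lwe_indexes_out; // host mirrors
  bool using_trivial_lwe_indexes = true;

  explicit IndexedLut(size_t num_blocks)
      : h_lwe_indexes_in(num_blocks), h_lwe_indexes_out(num_blocks) {
    for (size_t i = 0; i < num_blocks; i++) {
      h_lwe_indexes_in[i] = i; // trivial identity mapping by default
      h_lwe_indexes_out[i] = i;
    }
  }

  // Mirrors the intent of int_radix_lut::set_lwe_indexes: once a caller
  // installs a custom mapping, the fast paths must be disabled.
  void set_lwe_indexes(const uint64_t *in, const uint64_t *out) {
    std::memcpy(h_lwe_indexes_in.data(), in,
                h_lwe_indexes_in.size() * sizeof(uint64_t));
    std::memcpy(h_lwe_indexes_out.data(), out,
                h_lwe_indexes_out.size() * sizeof(uint64_t));
    using_trivial_lwe_indexes = false;
  }
};
```
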
@@ -805,12 +819,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
for (int i = 0; i < bits_per_block; i++)
h_lwe_indexes_in[i + j * bits_per_block] = j;
}
cuda_memcpy_async_to_gpu(lut->lwe_indexes_in, h_lwe_indexes_in,
num_radix_blocks * bits_per_block *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes_in);

/**
* the output should target different lwe ciphertexts, so lwe_indexes_out =
@@ -822,10 +830,11 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
for (int i = 0; i < num_radix_blocks * bits_per_block; i++)
h_lwe_indexes_out[i] = i;

cuda_memcpy_async_to_gpu(lut->lwe_indexes_out, h_lwe_indexes_out,
num_radix_blocks * bits_per_block *
sizeof(Torus),
streams[0], gpu_indexes[0]);
lut->set_lwe_indexes(streams[0], gpu_indexes[0], h_lwe_indexes_in,
h_lwe_indexes_out);

cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes_in);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes_out);
}
33 changes: 14 additions & 19 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -185,9 +185,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(

/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_lwe_trivial_scatter<Torus>(
multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_in,
num_radix_blocks, big_lwe_dimension + 1, false);
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
big_lwe_dimension + 1, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
@@ -207,9 +208,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
multi_gpu_lwe_trivial_gather<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_after_pbs_vec,
num_radix_blocks, big_lwe_dimension + 1, false);
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1, false);

/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
@@ -271,13 +273,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
// multi_gpu_lwe_scatter<Torus>(
// streams, gpu_indexes, gpu_count, lwe_array_in_vec,
// lwe_array_pbs_in, h_lwe_indexes_in, num_radix_blocks,
// big_lwe_dimension + 1, false);
multi_gpu_lwe_trivial_scatter<Torus>(
multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
num_radix_blocks, big_lwe_dimension + 1, false);
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
big_lwe_dimension + 1, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
@@ -297,14 +296,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
// multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count,
// lwe_array_out,
// lwe_after_pbs_vec, h_lwe_indexes_out,
// num_radix_blocks, big_lwe_dimension + 1,
// false);
multi_gpu_lwe_trivial_gather<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_after_pbs_vec,
num_radix_blocks, big_lwe_dimension + 1, false);
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1, false);

/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
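
Both lookup-table application paths above now call one pair of helpers, `multi_gpu_lwe_scatter` / `multi_gpu_lwe_gather`, passing the host index arrays together with `using_trivial_lwe_indexes`, instead of the dedicated `multi_gpu_lwe_trivial_*` variants. A hedged CUDA sketch of what a flag-aware gather can look like (the kernel and wrapper are hypothetical, not the repository's helpers, and ignore the multi-GPU chunking):

```cuda
// Hypothetical gather helper, not the repository's multi_gpu_lwe_gather.
#include <cstdint>
#include <cuda_runtime.h>

__global__ void gather_lwe_kernel(uint64_t *dst, const uint64_t *src,
                                  const uint64_t *indexes, uint32_t lwe_size,
                                  uint32_t num_lwe) {
  uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num_lwe * lwe_size) {
    uint32_t lwe = tid / lwe_size;
    uint32_t coef = tid % lwe_size;
    dst[(size_t)lwe * lwe_size + coef] =
        src[(size_t)indexes[lwe] * lwe_size + coef];
  }
}

// With trivial indexes the gather degenerates into a single contiguous
// device-to-device copy; otherwise an index-aware kernel is launched.
void gather_lwe(uint64_t *dst, const uint64_t *src, const uint64_t *d_indexes,
                bool indexes_are_trivial, uint32_t lwe_size, uint32_t num_lwe,
                cudaStream_t stream) {
  if (indexes_are_trivial) {
    cudaMemcpyAsync(dst, src, (size_t)num_lwe * lwe_size * sizeof(uint64_t),
                    cudaMemcpyDeviceToDevice, stream);
  } else {
    uint32_t total = num_lwe * lwe_size;
    uint32_t threads = 256;
    uint32_t blocks = (total + threads - 1) / threads;
    gather_lwe_kernel<<<blocks, threads, 0, stream>>>(dst, src, d_indexes,
                                                      lwe_size, num_lwe);
  }
}
```
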
34 changes: 16 additions & 18 deletions backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -366,19 +366,13 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
mem_ptr->params.pbs_type, true);
} else {
auto h_lwe_indexes_in = luts_message_carry->h_lwe_indexes_in;
auto h_lwe_indexes_out = luts_message_carry->h_lwe_indexes_out;
cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lwe_indexes_in,
total_count * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lwe_indexes_out,
total_count * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
new_blocks_vec, new_blocks, h_lwe_indexes_in,
message_count, big_lwe_size, false);
multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, new_blocks_vec, new_blocks,
luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, message_count,
big_lwe_size, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
@@ -394,13 +388,15 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
/// different configuration
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count,
small_lwe_vector, small_lwe_vector_vec,
h_lwe_indexes_in, message_count,
small_lwe_size);
luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes,
message_count, small_lwe_size);

multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
small_lwe_vector_vec, small_lwe_vector,
h_lwe_indexes_in, total_count,
small_lwe_size, false);
multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
small_lwe_vector, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, total_count,
small_lwe_size, false);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
@@ -415,7 +411,9 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
mem_ptr->params.pbs_type, false);

multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
lwe_after_pbs_vec, h_lwe_indexes_out,
lwe_after_pbs_vec,
luts_message_carry->h_lwe_indexes_out,
luts_message_carry->using_trivial_lwe_indexes,
total_count, big_lwe_size);
}

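
The previous sum-of-ciphertexts path copied `lwe_indexes_in` / `lwe_indexes_out` from the device into the LUT's host buffers before scattering; because `set_lwe_indexes` now keeps those host mirrors in sync, the device-to-host copies are no longer needed. A small sketch contrasting the two patterns (hypothetical names, not the repository's API):

```cuda
// Hypothetical sketch contrasting the two patterns; not the repository's API.
#include <cstdint>
#include <cuda_runtime.h>
#include <vector>

// Old pattern: read the indexes back from the device each time they are
// needed on the host, forcing a synchronization point.
void read_indexes_from_device(uint64_t *h_scratch, const uint64_t *d_indexes,
                              uint32_t count, cudaStream_t stream) {
  cudaMemcpyAsync(h_scratch, d_indexes, count * sizeof(uint64_t),
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream); // must wait before touching h_scratch
}

// New pattern: the host mirror is updated whenever the device copy is
// updated, so it can be read immediately with no copy and no extra sync.
struct LutIndexes {
  std::vector<uint64_t> h_lwe_indexes_in;
  bool using_trivial_lwe_indexes = true;
};

const uint64_t *read_indexes_from_mirror(const LutIndexes &lut) {
  return lut.h_lwe_indexes_in.data();
}
```
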
