chore(gpu): add option to pre-release some buffers in scalar mul

zama-ai · Dec 20, 2024 · 9b43a94 · 9b43a94
1 parent 33d5091
commit 9b43a94
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 5 deletions.
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -4271,12 +4271,15 @@ template <typename Torus> struct int_scalar_mul_buffer {
   Torus *preshifted_buffer;
   Torus *all_shifted_buffer;
   int_sc_prop_memory<Torus> *sc_prop_mem;
+  bool anticipated_buffers_drop;
 
   int_scalar_mul_buffer(cudaStream_t const *streams,
                         uint32_t const *gpu_indexes, uint32_t gpu_count,
                         int_radix_params params, uint32_t num_radix_blocks,
-                        bool allocate_gpu_memory) {
+                        bool allocate_gpu_memory,
+                        bool anticipated_buffer_drop) {
     this->params = params;
+    this->anticipated_buffers_drop = anticipated_buffer_drop;
 
     if (allocate_gpu_memory) {
       uint32_t msg_bits = (uint32_t)std::log2(params.message_modulus);
@@ -4324,6 +4327,11 @@ template <typename Torus> struct int_scalar_mul_buffer {
     delete sum_ciphertexts_vec_mem;
     delete sc_prop_mem;
     cuda_drop_async(all_shifted_buffer, streams[0], gpu_indexes[0]);
+    if (!anticipated_buffers_drop) {
+      cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
+      logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
+      delete (logical_scalar_shift_buffer);
+    }
   }
 };
 

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -36,7 +36,7 @@ __host__ void scratch_cuda_integer_radix_scalar_mul_kb(
 
   *mem_ptr =
       new int_scalar_mul_buffer<T>(streams, gpu_indexes, gpu_count, params,
-                                   num_radix_blocks, allocate_gpu_memory);
+                                   num_radix_blocks, allocate_gpu_memory, true);
 }
 
 template <typename T, class params>
@@ -94,9 +94,11 @@ __host__ void host_integer_scalar_mul_radix(
   }
   cuda_synchronize_stream(streams[0], gpu_indexes[0]);
 
-  cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
-  mem->logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
-  delete (mem->logical_scalar_shift_buffer);
+  if (mem->anticipated_buffers_drop) {
+    cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
+    mem->logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
+    delete (mem->logical_scalar_shift_buffer);
+  }
 
   if (j == 0) {
     // lwe array = 0