From f60bbfe3da4d69d3a98d05fa5e6426c90527a1ea Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Thu, 18 Jul 2024 11:10:54 +0200
Subject: [PATCH] chore(gpu): remove stream callbacks

---
 .../tfhe-cuda-backend/cuda/include/device.h   |  6 -----
 .../tfhe-cuda-backend/cuda/include/integer.h  | 22 +++++++++----------
 backends/tfhe-cuda-backend/cuda/src/device.cu | 12 ----------
 .../cuda/src/integer/integer.cuh              | 15 +++++--------
 4 files changed, 16 insertions(+), 39 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h
index e5d11b2bf4..203db79b82 100644
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -64,14 +64,8 @@ void cuda_drop(void *ptr, uint32_t gpu_index);
 void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
 
 int cuda_get_max_shared_memory(uint32_t gpu_index);
-
-void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
-                              cudaStreamCallback_t callback, void *user_data);
 }
 
-void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
-                                  void *host_pointer);
-
 template <typename Torus>
 void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
                           Torus *d_array, Torus value, Torus n);
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
index 9fccc4db45..c5425cd96f 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -565,8 +565,6 @@ template <typename Torus> struct int_radix_lut {
     cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
                              num_radix_blocks * sizeof(Torus), streams[0],
                              gpu_indexes[0]);
-    cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                             host_free_on_stream_callback, h_lwe_indexes);
 
     // Keyswitch
     Torus big_size =
@@ -577,6 +575,8 @@ template <typename Torus> struct int_radix_lut {
         (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
     tmp_lwe_after_ks =
         (Torus *)cuda_malloc_async(small_size, streams[0], gpu_indexes[0]);
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    free(h_lwe_indexes);
   }
 }
@@ -644,8 +644,8 @@ template <typename Torus> struct int_radix_lut {
     cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
                              num_radix_blocks * sizeof(Torus), streams[0],
                              gpu_indexes[0]);
-    cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                             host_free_on_stream_callback, h_lwe_indexes);
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    free(h_lwe_indexes);
   }
 
   // Return a pointer to idx-ith lut at gpu_index's global memory
@@ -770,8 +770,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
           num_radix_blocks * bits_per_block * sizeof(Torus), streams[0],
           gpu_indexes[0]);
       lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
-      cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                               host_free_on_stream_callback, h_lut_indexes);
 
       /**
        * the input indexes should take the first bits_per_block PBS to target
       * the same block
@@ -788,8 +786,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
           num_radix_blocks * bits_per_block * sizeof(Torus), streams[0],
           gpu_indexes[0]);
-      cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                               host_free_on_stream_callback, h_lwe_indexes_in);
 
       /**
        * the output should aim different lwe ciphertexts, so lwe_indexes_out =
@@ -805,8 +801,10 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
           num_radix_blocks * bits_per_block * sizeof(Torus), streams[0],
           gpu_indexes[0]);
-      cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                               host_free_on_stream_callback, h_lwe_indexes_out);
+      cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+      free(h_lut_indexes);
+      free(h_lwe_indexes_in);
+      free(h_lwe_indexes_out);
     }
   }
@@ -1012,8 +1010,6 @@ template <typename Torus> struct int_fullprop_buffer {
     Torus *lwe_indexes = lut->get_lut_indexes(gpu_indexes[0], 0);
     cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
                              streams[0], gpu_indexes[0]);
-    cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                             host_free_on_stream_callback, h_lwe_indexes);
 
     lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
@@ -1028,6 +1024,8 @@ template <typename Torus> struct int_fullprop_buffer {
         small_vector_size, streams[0], gpu_indexes[0]);
     tmp_big_lwe_vector = (Torus *)cuda_malloc_async(
         big_vector_size, streams[0], gpu_indexes[0]);
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    free(h_lwe_indexes);
   }
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu
index 7583da3c2a..b88e360d44 100644
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -248,15 +248,3 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
   check_cuda_error(cudaGetLastError());
   return max_shared_memory;
 }
-
-void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
-                              cudaStreamCallback_t callback, void *user_data) {
-
-  check_cuda_error(cudaSetDevice(gpu_index));
-  check_cuda_error(cudaStreamAddCallback(stream, callback, user_data, 0));
-}
-
-void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
-                                  void *host_pointer) {
-  free(host_pointer);
-}
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 1805f43976..7e216ccffc 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -340,9 +340,8 @@ void generate_device_accumulator_bivariate(
       sizeof(Torus), stream, gpu_index);
 
-  // Release memory when possible
-  cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
-                           h_lut);
+  cuda_synchronize_stream(stream, gpu_index);
+  free(h_lut);
 }
 
 /*
@@ -374,9 +373,8 @@ void generate_device_accumulator_bivariate_with_factor(
       sizeof(Torus), stream, gpu_index);
 
-  // Release memory when possible
-  cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
-                           h_lut);
+  cuda_synchronize_stream(stream, gpu_index);
+  free(h_lut);
 }
 
 /*
@@ -408,9 +406,8 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
       acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
       stream, gpu_index);
 
-  // Release memory when possible
-  cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
-                           h_lut);
+  cuda_synchronize_stream(stream, gpu_index);
+  free(h_lut);
 }
 
 template <typename Torus>
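
Note: every hunk above follows the same pattern: an asynchronous host-to-device copy of a staging buffer, then cuda_synchronize_stream followed by a plain free of the host pointer, instead of deferring the free to a cudaStreamAddCallback callback. The standalone sketch below illustrates that ordering using only the raw CUDA runtime API; the helper names (check, stage_indexes_to_gpu) and the buffer contents are hypothetical and are not part of tfhe-cuda-backend.

// Minimal sketch, assuming plain CUDA runtime API (no tfhe-cuda-backend helpers).
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

static void check(cudaError_t err) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

// Fills a host staging buffer with 0..n-1, copies it asynchronously to
// `d_indexes` on `stream`, then synchronizes the stream so the host buffer
// can be freed immediately and safely (instead of freeing it later from a
// cudaStreamAddCallback callback).
void stage_indexes_to_gpu(uint64_t *d_indexes, uint32_t n, cudaStream_t stream,
                          uint32_t gpu_index) {
  check(cudaSetDevice(gpu_index));
  uint64_t *h_indexes = (uint64_t *)malloc(n * sizeof(uint64_t));
  for (uint32_t i = 0; i < n; i++)
    h_indexes[i] = i;
  check(cudaMemcpyAsync(d_indexes, h_indexes, n * sizeof(uint64_t),
                        cudaMemcpyHostToDevice, stream));
  // Wait for the copy to complete before releasing the host staging buffer.
  check(cudaStreamSynchronize(stream));
  free(h_indexes);
}

int main() {
  cudaStream_t stream;
  check(cudaStreamCreate(&stream));
  uint64_t *d_indexes;
  uint32_t n = 8;
  check(cudaMalloc((void **)&d_indexes, n * sizeof(uint64_t)));
  stage_indexes_to_gpu(d_indexes, n, stream, /*gpu_index=*/0);
  check(cudaFree(d_indexes));
  check(cudaStreamDestroy(stream));
  return 0;
}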