From f60bbfe3da4d69d3a98d05fa5e6426c90527a1ea Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Thu, 18 Jul 2024 11:10:54 +0200
Subject: [PATCH] chore(gpu): remove stream callbacks

---
 .../tfhe-cuda-backend/cuda/include/device.h   |  6 -----
 .../tfhe-cuda-backend/cuda/include/integer.h  | 22 +++++++++----------
 backends/tfhe-cuda-backend/cuda/src/device.cu | 12 ----------
 .../cuda/src/integer/integer.cuh              | 15 +++++--------
 4 files changed, 16 insertions(+), 39 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h
index e5d11b2bf4..203db79b82 100644
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -64,14 +64,8 @@ void cuda_drop(void *ptr, uint32_t gpu_index);
 void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
 
 int cuda_get_max_shared_memory(uint32_t gpu_index);
-
-void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
-                              cudaStreamCallback_t callback, void *user_data);
 }
 
-void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
-                                  void *host_pointer);
-
 template <typename Torus>
 void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
                           Torus *d_array, Torus value, Torus n);
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
index 9fccc4db45..c5425cd96f 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -565,8 +565,6 @@ template <typename Torus> struct int_radix_lut {
     cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
                              num_radix_blocks * sizeof(Torus), streams[0],
                              gpu_indexes[0]);
-    cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                             host_free_on_stream_callback, h_lwe_indexes);
 
     // Keyswitch
     Torus big_size =
@@ -577,6 +575,8 @@ template <typename Torus> struct int_radix_lut {
         (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
     tmp_lwe_after_ks =
         (Torus *)cuda_malloc_async(small_size, streams[0], gpu_indexes[0]);
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    free(h_lwe_indexes);
   }
 }
@@ -644,8 +644,8 @@ template <typename Torus> struct int_radix_lut {
     cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
                              num_radix_blocks * sizeof(Torus), streams[0],
                              gpu_indexes[0]);
-    cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                             host_free_on_stream_callback, h_lwe_indexes);
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    free(h_lwe_indexes);
   }
 
   // Return a pointer to idx-ith lut at gpu_index's global memory
@@ -770,8 +770,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
           num_radix_blocks * bits_per_block * sizeof(Torus), streams[0],
           gpu_indexes[0]);
       lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
-      cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                               host_free_on_stream_callback, h_lut_indexes);
 
       /**
        * the input indexes should take the first bits_per_block PBS to target
       * the same block
@@ -788,8 +786,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
           num_radix_blocks * bits_per_block * sizeof(Torus), streams[0],
           gpu_indexes[0]);
-      cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                               host_free_on_stream_callback, h_lwe_indexes_in);
 
       /**
        * the output should aim different lwe ciphertexts, so lwe_indexes_out =
@@ -805,8 +801,10 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
           num_radix_blocks * bits_per_block * sizeof(Torus), streams[0],
           gpu_indexes[0]);
-      cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                               host_free_on_stream_callback, h_lwe_indexes_out);
+      cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+      free(h_lut_indexes);
+      free(h_lwe_indexes_in);
+      free(h_lwe_indexes_out);
     }
   }
@@ -1012,8 +1010,6 @@ template <typename Torus> struct int_fullprop_buffer {
     Torus *lwe_indexes = lut->get_lut_indexes(gpu_indexes[0], 0);
     cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
                              streams[0], gpu_indexes[0]);
-    cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                             host_free_on_stream_callback, h_lwe_indexes);
 
     lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
@@ -1028,6 +1024,8 @@ template <typename Torus> struct int_fullprop_buffer {
         small_vector_size, streams[0], gpu_indexes[0]);
     tmp_big_lwe_vector = (Torus *)cuda_malloc_async(
         big_vector_size, streams[0], gpu_indexes[0]);
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    free(h_lwe_indexes);
   }
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu
index 7583da3c2a..b88e360d44 100644
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -248,15 +248,3 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
   check_cuda_error(cudaGetLastError());
   return max_shared_memory;
 }
-
-void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
-                              cudaStreamCallback_t callback, void *user_data) {
-
-  check_cuda_error(cudaSetDevice(gpu_index));
-  check_cuda_error(cudaStreamAddCallback(stream, callback, user_data, 0));
-}
-
-void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
-                                  void *host_pointer) {
-  free(host_pointer);
-}
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 1805f43976..7e216ccffc 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -340,9 +340,8 @@ void generate_device_accumulator_bivariate(
       sizeof(Torus), stream, gpu_index);
 
-  // Release memory when possible
-  cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
-                           h_lut);
+  cuda_synchronize_stream(stream, gpu_index);
+  free(h_lut);
 }
 
 /*
@@ -374,9 +373,8 @@ void generate_device_accumulator_bivariate_with_factor(
       sizeof(Torus), stream, gpu_index);
 
-  // Release memory when possible
-  cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
-                           h_lut);
+  cuda_synchronize_stream(stream, gpu_index);
+  free(h_lut);
 }
 
 /*
@@ -408,9 +406,8 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
       acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
       stream, gpu_index);
 
-  // Release memory when possible
-  cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
-                           h_lut);
+  cuda_synchronize_stream(stream, gpu_index);
+  free(h_lut);
 }
 
 template <typename Torus>
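
Note: every hunk above follows the same pattern: an asynchronous host-to-device copy of a staging buffer, then cuda_synchronize_stream followed by a plain free of the host pointer, instead of deferring the free to a cudaStreamAddCallback callback. The standalone sketch below illustrates that ordering using only the raw CUDA runtime API; the helper names (check, stage_indexes_to_gpu) and the buffer contents are hypothetical and are not part of tfhe-cuda-backend.

// Minimal sketch, assuming plain CUDA runtime API (no tfhe-cuda-backend helpers).
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

static void check(cudaError_t err) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

// Fills a host staging buffer with 0..n-1, copies it asynchronously to
// `d_indexes` on `stream`, then synchronizes the stream so the host buffer
// can be freed immediately and safely (instead of freeing it later from a
// cudaStreamAddCallback callback).
void stage_indexes_to_gpu(uint64_t *d_indexes, uint32_t n, cudaStream_t stream,
                          uint32_t gpu_index) {
  check(cudaSetDevice(gpu_index));
  uint64_t *h_indexes = (uint64_t *)malloc(n * sizeof(uint64_t));
  for (uint32_t i = 0; i < n; i++)
    h_indexes[i] = i;
  check(cudaMemcpyAsync(d_indexes, h_indexes, n * sizeof(uint64_t),
                        cudaMemcpyHostToDevice, stream));
  // Wait for the copy to complete before releasing the host staging buffer.
  check(cudaStreamSynchronize(stream));
  free(h_indexes);
}

int main() {
  cudaStream_t stream;
  check(cudaStreamCreate(&stream));
  uint64_t *d_indexes;
  uint32_t n = 8;
  check(cudaMalloc((void **)&d_indexes, n * sizeof(uint64_t)));
  stage_indexes_to_gpu(d_indexes, n, stream, /*gpu_index=*/0);
  check(cudaFree(d_indexes));
  check(cudaStreamDestroy(stream));
  return 0;
}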