Skip to content

Commit

Permalink
chore(gpu): remove stream callbacks
Browse files Browse the repository at this point in the history
  • Loading branch information
agnesLeroy committed Jul 18, 2024
1 parent f8d8cc9 commit f60bbfe
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 39 deletions.
6 changes: 0 additions & 6 deletions backends/tfhe-cuda-backend/cuda/include/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,8 @@ void cuda_drop(void *ptr, uint32_t gpu_index);
void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);

int cuda_get_max_shared_memory(uint32_t gpu_index);

void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
cudaStreamCallback_t callback, void *user_data);
}

void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
void *host_pointer);

template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
Torus *d_array, Torus value, Torus n);
Expand Down
22 changes: 10 additions & 12 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -565,8 +565,6 @@ template <typename Torus> struct int_radix_lut {
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes);

// Keyswitch
Torus big_size =
Expand All @@ -577,6 +575,8 @@ template <typename Torus> struct int_radix_lut {
(Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
tmp_lwe_after_ks =
(Torus *)cuda_malloc_async(small_size, streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lwe_indexes);
}
}

Expand Down Expand Up @@ -644,8 +644,8 @@ template <typename Torus> struct int_radix_lut {
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lwe_indexes);
}

// Return a pointer to idx-ith lut at gpu_index's global memory
Expand Down Expand Up @@ -770,8 +770,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
num_radix_blocks * bits_per_block * sizeof(Torus), streams[0],
gpu_indexes[0]);
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lut_indexes);

/**
* the input indexes should take the first bits_per_block PBS to target
Expand All @@ -788,8 +786,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
num_radix_blocks * bits_per_block *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes_in);

/**
* the output should aim different lwe ciphertexts, so lwe_indexes_out =
Expand All @@ -805,8 +801,10 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
num_radix_blocks * bits_per_block *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes_out);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lut_indexes);
free(h_lwe_indexes_in);
free(h_lwe_indexes_out);
}
}

Expand Down Expand Up @@ -1012,8 +1010,6 @@ template <typename Torus> struct int_fullprop_buffer {
Torus *lwe_indexes = lut->get_lut_indexes(gpu_indexes[0], 0);
cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
streams[0], gpu_indexes[0]);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes);

lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

Expand All @@ -1028,6 +1024,8 @@ template <typename Torus> struct int_fullprop_buffer {
small_vector_size, streams[0], gpu_indexes[0]);
tmp_big_lwe_vector = (Torus *)cuda_malloc_async(
big_vector_size, streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lwe_indexes);
}
}

Expand Down
12 changes: 0 additions & 12 deletions backends/tfhe-cuda-backend/cuda/src/device.cu
Original file line number Diff line number Diff line change
Expand Up @@ -248,15 +248,3 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
check_cuda_error(cudaGetLastError());
return max_shared_memory;
}

// Registers a host-side callback on `stream` after binding the calling thread
// to device `gpu_index`. cudaStreamAddCallback enqueues the callback so it
// fires once all work previously submitted to the stream has completed
// (flags argument must be 0 per the CUDA runtime API).
// NOTE(review): this wrapper is deleted by this commit; call sites now use an
// explicit cuda_synchronize_stream followed by free() instead.
void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
cudaStreamCallback_t callback, void *user_data) {

check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaStreamAddCallback(stream, callback, user_data, 0));
}

// Stream-callback adapter that releases a host allocation once the stream
// reaches this point; `stream` and `status` are required by the
// cudaStreamCallback_t signature but intentionally unused here.
// NOTE(review): freeing in a stream callback ignores `status`; errors from
// prior stream work are silently dropped — one motivation for this commit's
// switch to synchronize-then-free at the call sites.
void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
void *host_pointer) {
free(host_pointer);
}
15 changes: 6 additions & 9 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -340,9 +340,8 @@ void generate_device_accumulator_bivariate(
sizeof(Torus),
stream, gpu_index);

// Release memory when possible
cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
h_lut);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
}

/*
Expand Down Expand Up @@ -374,9 +373,8 @@ void generate_device_accumulator_bivariate_with_factor(
sizeof(Torus),
stream, gpu_index);

// Release memory when possible
cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
h_lut);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
}

/*
Expand Down Expand Up @@ -408,9 +406,8 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index);

// Release memory when possible
cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
h_lut);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
}

template <typename Torus>
Expand Down

0 comments on commit f60bbfe

Please sign in to comment.