refactor(gpu): add a parameter to enable optimizations in case lwe_indexes_(in/out) is trivial
pdroalves authored and agnesLeroy committed Jun 27, 2024
1 parent 1bfa71f commit 72ad8e6
Showing 4 changed files with 151 additions and 184 deletions.
117 changes: 73 additions & 44 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -468,6 +468,8 @@ template <typename Torus> struct int_radix_lut {
Torus *lwe_indexes_out;
Torus *h_lwe_indexes_in;
Torus *h_lwe_indexes_out;
// Enable optimizations if lwe_indexes_(in/out) are trivial
bool using_trivial_lwe_indexes = true;
// lwe_trivial_indexes is the intermediary index we need in case
// lwe_indexes_in != lwe_indexes_out
Torus *lwe_trivial_indexes;
@@ -542,22 +544,20 @@ template <typename Torus> struct int_radix_lut {
h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));

auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus));

for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes[i] = i;
h_lwe_indexes_in[i] = i;

cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes);
memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus));

/// With multiple GPUs we allocate arrays to be pushed to the vectors and
/// copy data on each GPU then when we gather data to GPU 0 we can copy
@@ -646,22 +646,20 @@ template <typename Torus> struct int_radix_lut {
h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));

auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus));

for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes[i] = i;
h_lwe_indexes_in[i] = i;

cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes);
memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus));
}

// Return a pointer to idx-ith lut at gpu_index's global memory
@@ -679,6 +677,22 @@ template <typename Torus> struct int_radix_lut {
return &lut_indexes[ind];
}

// If this function is called, we assume lwe_indexes_(in/out) are no longer
// trivial and disable the related optimizations
void set_lwe_indexes(cudaStream_t stream, uint32_t gpu_index,
Torus *h_indexes_in, Torus *h_indexes_out) {

memcpy(h_lwe_indexes_in, h_indexes_in, num_blocks * sizeof(Torus));
memcpy(h_lwe_indexes_out, h_indexes_out, num_blocks * sizeof(Torus));

cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
num_blocks * sizeof(Torus), stream, gpu_index);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_out,
num_blocks * sizeof(Torus), stream, gpu_index);

using_trivial_lwe_indexes = false;
}

// Broadcast luts from gpu src_gpu_idx to all active gpus
void broadcast_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t src_gpu_idx) {
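The new set_lwe_indexes entry point above is the only way callers register non-trivial index arrays, and calling it is what flips using_trivial_lwe_indexes off. Below is a minimal sketch of a caller, assuming integer.h (above) and <vector> are available; the helper name and the permutation argument are illustrative, only int_radix_lut and set_lwe_indexes come from this change.

#include <vector>

// Hypothetical helper: route a LUT application through a caller-chosen output
// permutation. perm must hold one entry per radix block of the LUT.
template <typename Torus>
void use_permuted_indexes(int_radix_lut<Torus> *lut, cudaStream_t stream,
                          uint32_t gpu_index, const std::vector<Torus> &perm) {
  std::vector<Torus> h_in(perm.size()), h_out(perm.size());
  for (size_t i = 0; i < perm.size(); i++) {
    h_in[i] = static_cast<Torus>(i); // read block i in natural order
    h_out[i] = perm[i];              // write it to the permuted slot
  }
  // Copies both arrays to the GPU and clears using_trivial_lwe_indexes, so
  // later scatter/gather calls take the index-aware path.
  lut->set_lwe_indexes(stream, gpu_index, h_in.data(), h_out.data());
}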
@@ -810,12 +824,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
for (int i = 0; i < bits_per_block; i++)
h_lwe_indexes_in[i + j * bits_per_block] = j;
}
cuda_memcpy_async_to_gpu(lut->lwe_indexes_in, h_lwe_indexes_in,
num_radix_blocks * bits_per_block *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes_in);

/**
* the output should target different lwe ciphertexts, so lwe_indexes_out =
@@ -827,10 +835,11 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
for (int i = 0; i < num_radix_blocks * bits_per_block; i++)
h_lwe_indexes_out[i] = i;

cuda_memcpy_async_to_gpu(lut->lwe_indexes_out, h_lwe_indexes_out,
num_radix_blocks * bits_per_block *
sizeof(Torus),
streams[0], gpu_indexes[0]);
lut->set_lwe_indexes(streams[0], gpu_indexes[0], h_lwe_indexes_in,
h_lwe_indexes_out);

cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes_in);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes_out);
}
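For the bit-extract buffer, the two host arrays built above encode "reuse each input block once per extracted bit, give every extracted bit its own output slot". A small worked example, with num_radix_blocks = 2 and bits_per_block = 4 chosen purely for illustration:

// Example sizes only: num_radix_blocks = 2, bits_per_block = 4.
uint64_t h_in[8], h_out[8];
for (int j = 0; j < 2; j++)
  for (int i = 0; i < 4; i++)
    h_in[i + j * 4] = j; // h_in  = {0,0,0,0,1,1,1,1}: input block j read once per bit
for (int i = 0; i < 8; i++)
  h_out[i] = i;          // h_out = {0,1,2,3,4,5,6,7}: a distinct output per bit
// Registering the arrays through lut->set_lwe_indexes(...) also marks the
// indexes as non-trivial, which is why the direct cuda_memcpy_async_to_gpu
// calls could be dropped from this constructor.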
@@ -1747,19 +1756,23 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {

cudaStream_t *local_streams_1;
cudaStream_t *local_streams_2;
uint32_t active_gpu_count;

int_arithmetic_scalar_shift_buffer(cudaStream_t *streams,
uint32_t *gpu_indexes, uint32_t gpu_count,
SHIFT_OR_ROTATE_TYPE shift_type,
int_radix_params params,
uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
// In the arithmetic shift, a PBS has to be applied to the last rotated
// block twice: once to shift it, once to compute the padding block to be
// copied onto all blocks to the left of the last rotated block
local_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
local_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < gpu_count; j++) {
local_streams_1 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
local_streams_2 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < active_gpu_count; j++) {
local_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
local_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
}
@@ -1892,7 +1905,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {

void release(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count) {
for (uint j = 0; j < gpu_count; j++) {
for (uint j = 0; j < active_gpu_count; j++) {
cuda_destroy_stream(local_streams_1[j], gpu_indexes[j]);
cuda_destroy_stream(local_streams_2[j], gpu_indexes[j]);
}
@@ -1921,20 +1934,24 @@ template <typename Torus> struct int_zero_out_if_buffer {

cudaStream_t *true_streams;
cudaStream_t *false_streams;
uint32_t active_gpu_count;

int_zero_out_if_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
this->params = params;
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);

Torus big_size =
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
if (allocate_gpu_memory) {
tmp = (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
// We may use a different stream to allow concurrent operation
true_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
false_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < gpu_count; j++) {
true_streams =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
false_streams =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < active_gpu_count; j++) {
true_streams[j] = cuda_create_stream(gpu_indexes[j]);
false_streams[j] = cuda_create_stream(gpu_indexes[j]);
}
@@ -1943,7 +1960,7 @@ template <typename Torus> struct int_zero_out_if_buffer {
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(tmp, streams[0], gpu_indexes[0]);
for (uint j = 0; j < gpu_count; j++) {
for (uint j = 0; j < active_gpu_count; j++) {
cuda_destroy_stream(true_streams[j], gpu_indexes[j]);
cuda_destroy_stream(false_streams[j], gpu_indexes[j]);
}
@@ -2377,6 +2394,7 @@ template <typename Torus> struct int_comparison_buffer {
int_radix_lut<Torus> *signed_msb_lut;
cudaStream_t *lsb_streams;
cudaStream_t *msb_streams;
uint32_t active_gpu_count;

int_comparison_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, COMPARISON_TYPE op,
@@ -2386,14 +2404,18 @@
this->op = op;
this->is_signed = is_signed;

active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);

identity_lut_f = [](Torus x) -> Torus { return x; };

auto big_lwe_size = params.big_lwe_dimension + 1;

if (allocate_gpu_memory) {
lsb_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
msb_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < gpu_count; j++) {
lsb_streams =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
msb_streams =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < active_gpu_count; j++) {
lsb_streams[j] = cuda_create_stream(gpu_indexes[j]);
msb_streams[j] = cuda_create_stream(gpu_indexes[j]);
}
@@ -2557,7 +2579,7 @@ template <typename Torus> struct int_comparison_buffer {
signed_msb_lut->release(streams, gpu_indexes, gpu_count);
delete (signed_msb_lut);
}
for (uint j = 0; j < gpu_count; j++) {
for (uint j = 0; j < active_gpu_count; j++) {
cuda_destroy_stream(lsb_streams[j], gpu_indexes[j]);
cuda_destroy_stream(msb_streams[j], gpu_indexes[j]);
}
@@ -2568,6 +2590,7 @@

template <typename Torus> struct int_div_rem_memory {
int_radix_params params;
uint32_t active_gpu_count;

// memory objects for other operations
int_logical_scalar_shift_buffer<Torus> *shift_mem_1;
@@ -2803,6 +2826,8 @@ template <typename Torus> struct int_div_rem_memory {
int_div_rem_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_blocks, bool allocate_gpu_memory) {
active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);

this->params = params;
shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
@@ -2822,11 +2847,15 @@ template <typename Torus> struct int_div_rem_memory {
init_lookup_tables(streams, gpu_indexes, gpu_count, num_blocks);
init_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks);

sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
sub_streams_3 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
sub_streams_4 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < gpu_count; j++) {
sub_streams_1 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_2 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_3 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_4 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < active_gpu_count; j++) {
sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_3[j] = cuda_create_stream(gpu_indexes[j]);
@@ -2897,7 +2926,7 @@ template <typename Torus> struct int_div_rem_memory {
delete[] merge_overflow_flags_luts;

// release sub streams
for (uint i = 0; i < gpu_count; i++) {
for (uint i = 0; i < active_gpu_count; i++) {
cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
cuda_destroy_stream(sub_streams_3[i], gpu_indexes[i]);
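A recurring change in the buffers above is sizing the per-operation stream arrays with get_active_gpu_count instead of the raw gpu_count, so no streams are created for GPUs that will never receive a radix block. The sketch below illustrates the idea; the body of the count function is a guess for illustration, not the backend's actual implementation, and only cuda_create_stream and the allocation pattern come from the diff above.

#include <algorithm>
#include <cstdlib>

// Assumed contract of get_active_gpu_count: never more GPUs than blocks.
static uint32_t active_gpu_count_sketch(uint32_t num_radix_blocks,
                                        uint32_t gpu_count) {
  return std::min(num_radix_blocks, gpu_count); // hypothetical body
}

// Allocation pattern used by the structs above: create exactly
// active_gpu_count streams, and destroy exactly that many on release.
static cudaStream_t *alloc_local_streams(uint32_t active_gpu_count,
                                         uint32_t *gpu_indexes) {
  auto *streams =
      (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
  for (uint32_t j = 0; j < active_gpu_count; j++)
    streams[j] = cuda_create_stream(gpu_indexes[j]);
  return streams;
}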
33 changes: 14 additions & 19 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -185,9 +185,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(

/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_lwe_trivial_scatter<Torus>(
multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_in,
num_radix_blocks, big_lwe_dimension + 1, false);
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
big_lwe_dimension + 1, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
Expand All @@ -207,9 +208,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
multi_gpu_lwe_trivial_gather<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_after_pbs_vec,
num_radix_blocks, big_lwe_dimension + 1, false);
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1, false);

/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
@@ -271,13 +273,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
// multi_gpu_lwe_scatter<Torus>(
// streams, gpu_indexes, gpu_count, lwe_array_in_vec,
// lwe_array_pbs_in, h_lwe_indexes_in, num_radix_blocks,
// big_lwe_dimension + 1, false);
multi_gpu_lwe_trivial_scatter<Torus>(
multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
num_radix_blocks, big_lwe_dimension + 1, false);
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
big_lwe_dimension + 1, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
@@ -297,14 +296,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
// multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count,
// lwe_array_out,
// lwe_after_pbs_vec, h_lwe_indexes_out,
// num_radix_blocks, big_lwe_dimension + 1,
// false);
multi_gpu_lwe_trivial_gather<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_after_pbs_vec,
num_radix_blocks, big_lwe_dimension + 1, false);
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1, false);

/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
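With the trivial gather/scatter variants replaced by index-aware ones, the using_trivial_lwe_indexes flag is what lets the helpers keep the fast contiguous copy when the mapping is still the identity. A host-side illustration of that branch follows; the names and signature are assumptions, and the real multi_gpu_lwe_scatter/gather distribute data across GPUs rather than copying in host memory.

#include <cstdint>
#include <cstring>

// Illustration only: identity indexes allow one bulk copy, arbitrary indexes
// force a per-block indirection through the host-side index array.
template <typename Torus>
void scatter_sketch(Torus *dst, const Torus *src, const Torus *h_indexes_in,
                    bool using_trivial_lwe_indexes, uint32_t num_blocks,
                    uint32_t lwe_size) {
  if (using_trivial_lwe_indexes) {
    // Fast path: the source is already laid out in block order.
    std::memcpy(dst, src, (size_t)num_blocks * lwe_size * sizeof(Torus));
  } else {
    // General path: pick each block through its index.
    for (uint32_t i = 0; i < num_blocks; i++)
      std::memcpy(dst + (size_t)i * lwe_size,
                  src + (size_t)h_indexes_in[i] * lwe_size,
                  lwe_size * sizeof(Torus));
  }
}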