Skip to content

Commit

Permalink
chore(gpu): fix syncs
Browse files Browse the repository at this point in the history
  • Loading branch information
agnesLeroy committed Jul 19, 2024
1 parent 18dbc11 commit 8dffbbc
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 149 deletions.
26 changes: 13 additions & 13 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -583,15 +583,15 @@ template <typename Torus> struct int_radix_lut {
/// With multiple GPUs, we allocate arrays to be pushed to the vectors and
/// copy data to each GPU; then, when we gather data back to GPU 0, we can
/// copy it back to the original indexing
multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
num_radix_blocks, params.big_lwe_dimension + 1,
false);
multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
num_radix_blocks, params.small_lwe_dimension + 1,
false);
multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
num_radix_blocks, params.big_lwe_dimension + 1,
false);
multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count,
lwe_array_in_vec, num_radix_blocks,
params.big_lwe_dimension + 1, false);
multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, num_radix_blocks,
params.small_lwe_dimension + 1, false);
multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count,
lwe_after_pbs_vec, num_radix_blocks,
params.big_lwe_dimension + 1, false);
multi_gpu_alloc_array(streams, gpu_indexes, active_gpu_count,
lwe_trivial_indexes_vec, num_radix_blocks, false);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
Expand Down Expand Up @@ -778,10 +778,10 @@ template <typename Torus> struct int_radix_lut {
}
buffer.clear();

multi_gpu_release_lwe(streams, gpu_indexes, lwe_array_in_vec, false);
multi_gpu_release_lwe(streams, gpu_indexes, lwe_after_ks_vec, false);
multi_gpu_release_lwe(streams, gpu_indexes, lwe_after_pbs_vec, false);
multi_gpu_release_lwe(streams, gpu_indexes, lwe_trivial_indexes_vec);
multi_gpu_release(streams, gpu_indexes, lwe_array_in_vec, false);
multi_gpu_release(streams, gpu_indexes, lwe_after_ks_vec, false);
multi_gpu_release(streams, gpu_indexes, lwe_after_pbs_vec, false);
multi_gpu_release(streams, gpu_indexes, lwe_trivial_indexes_vec);
lwe_array_in_vec.clear();
lwe_after_ks_vec.clear();
lwe_after_pbs_vec.clear();
Expand Down
10 changes: 1 addition & 9 deletions backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,10 @@ void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes,
const LweArrayVariant<Torus> &lwe_input_indexes,
Torus **ksks, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples,
bool sync_streams = true) {
uint32_t level_count, uint32_t num_samples) {

/// If the number of radix blocks is lower than the number of GPUs, not all
/// GPUs will be active and there will be 1 input per GPU
if (sync_streams)
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < gpu_count; i++) {
int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);

Expand All @@ -155,11 +152,6 @@ void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes,
current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out,
base_log, level_count, num_samples_on_gpu);
}

if (sync_streams)
for (uint i = 0; i < gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}

#endif
81 changes: 41 additions & 40 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -168,17 +168,17 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
lwe_trivial_indexes_vec[0], lwe_array_in,
lut->lwe_indexes_in, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks, false);
num_radix_blocks);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
execute_pbs<Torus>(streams, gpu_indexes, 1, lwe_array_out,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
lut->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
Expand All @@ -188,14 +188,14 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
multi_gpu_scatter_lwe<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_in,
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
big_lwe_dimension + 1, false);
big_lwe_dimension + 1);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
lwe_trivial_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks, false);
execute_keyswitch<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec, ksks,
big_lwe_dimension, small_lwe_dimension,
ks_base_log, ks_level, num_radix_blocks);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
Expand All @@ -205,13 +205,14 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);

/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe<Torus>(streams, gpu_indexes, active_gpu_count, lwe_array_out,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
multi_gpu_gather_lwe<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_array_out, lwe_after_pbs_vec,
lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1, false);
num_radix_blocks, big_lwe_dimension + 1);

/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
Expand Down Expand Up @@ -260,30 +261,30 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
lwe_trivial_indexes_vec[0], lwe_array_pbs_in,
lut->lwe_indexes_in, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks, false);
num_radix_blocks);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
execute_pbs<Torus>(streams, gpu_indexes, 1, lwe_array_out,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
lut->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_scatter_lwe<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
big_lwe_dimension + 1, false);
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
lwe_array_pbs_in, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
lwe_trivial_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks, false);
execute_keyswitch<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec, ksks,
big_lwe_dimension, small_lwe_dimension,
ks_base_log, ks_level, num_radix_blocks);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
Expand All @@ -293,13 +294,14 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);

/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe<Torus>(streams, gpu_indexes, active_gpu_count, lwe_array_out,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
multi_gpu_gather_lwe<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_array_out, lwe_after_pbs_vec,
lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1, false);
num_radix_blocks, big_lwe_dimension + 1);

/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
Expand Down Expand Up @@ -674,8 +676,7 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
streams, gpu_indexes, 1, mem_ptr->tmp_small_lwe_vector,
mem_ptr->lut->lwe_trivial_indexes, cur_input_block,
mem_ptr->lut->lwe_trivial_indexes, ksks, params.big_lwe_dimension,
params.small_lwe_dimension, params.ks_base_log, params.ks_level, 1,
false);
params.small_lwe_dimension, params.ks_base_log, params.ks_level, 1);

cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size],
mem_ptr->tmp_small_lwe_vector,
Expand Down
27 changes: 15 additions & 12 deletions backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
lwe_indexes_in, new_blocks, lwe_indexes_in, ksks,
polynomial_size * glwe_dimension,
small_lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, message_count, false);
mem_ptr->params.ks_level, message_count);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
Expand All @@ -370,26 +370,25 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
glwe_dimension, small_lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
mem_ptr->params.pbs_type, false);
mem_ptr->params.pbs_type);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

multi_gpu_scatter_lwe<Torus>(
streams, gpu_indexes, active_gpu_count, new_blocks_vec, new_blocks,
luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, total_count,
big_lwe_size, false);
big_lwe_size);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch<Torus>(streams, gpu_indexes, active_gpu_count,
small_lwe_vector_vec, lwe_trivial_indexes_vec,
new_blocks_vec, lwe_trivial_indexes_vec, ksks,
big_lwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, total_count, false);
execute_keyswitch<Torus>(
streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
ksks, big_lwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
Expand All @@ -401,13 +400,17 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
glwe_dimension, small_lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
mem_ptr->params.pbs_type, false);
mem_ptr->params.pbs_type);

multi_gpu_gather_lwe<Torus>(streams, gpu_indexes, active_gpu_count, new_blocks,
lwe_after_pbs_vec,
multi_gpu_gather_lwe<Torus>(streams, gpu_indexes, active_gpu_count,
new_blocks, lwe_after_pbs_vec,
luts_message_carry->h_lwe_indexes_out,
luts_message_carry->using_trivial_lwe_indexes,
total_count, big_lwe_size);
/// Synchronize the streams of every active GPU except GPU 0 (the loop
/// below starts at index 1)
for (uint i = 1; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}

int rem_blocks = (r > chunk_size) ? r % chunk_size * num_blocks : 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,7 @@ void execute_pbs(
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, PBS_TYPE pbs_type, bool sync_streams = true) {
if (sync_streams)
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
uint32_t max_shared_memory, PBS_TYPE pbs_type) {
switch (sizeof(Torus)) {
case sizeof(uint32_t):
// 32 bits
Expand Down Expand Up @@ -242,11 +240,6 @@ void execute_pbs(
PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer "
"moduli are supported.")
}

if (sync_streams)
for (uint i = 0; i < gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}

template <typename Torus>
Expand Down
Loading

0 comments on commit 8dffbbc

Please sign in to comment.