chore(gpu): better use active_gpu_count
agnesLeroy committed Jul 19, 2024
1 parent 04ff5af commit 18dbc11
Showing 5 changed files with 23 additions and 26 deletions.
10 changes: 5 additions & 5 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -583,19 +583,19 @@ template <typename Torus> struct int_radix_lut {
/// With multiple GPUs we allocate arrays to be pushed to the vectors and
/// copy data on each GPU then when we gather data to GPU 0 we can copy
/// back to the original indexing
-multi_gpu_alloc_lwe(streams, gpu_indexes, gpu_count, lwe_array_in_vec,
+multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
num_radix_blocks, params.big_lwe_dimension + 1,
false);
-multi_gpu_alloc_lwe(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
+multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
num_radix_blocks, params.small_lwe_dimension + 1,
false);
-multi_gpu_alloc_lwe(streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
+multi_gpu_alloc_lwe(streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
num_radix_blocks, params.big_lwe_dimension + 1,
false);
-multi_gpu_alloc_array(streams, gpu_indexes, gpu_count,
+multi_gpu_alloc_array(streams, gpu_indexes, active_gpu_count,
lwe_trivial_indexes_vec, num_radix_blocks, false);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-multi_gpu_copy_array(streams, gpu_indexes, gpu_count,
+multi_gpu_copy_array(streams, gpu_indexes, active_gpu_count,
lwe_trivial_indexes_vec, lwe_trivial_indexes,
num_radix_blocks, false);

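The hunk above sizes the per-GPU scratch vectors with active_gpu_count rather than the total gpu_count. A minimal sketch of the idea, assuming (this is an assumption, consistent with the comment in keyswitch.cuh below) that the active count is simply the number of GPUs that receive at least one radix block; the backend's actual get_active_gpu_count may differ:

#include <algorithm>
#include <cstdint>

// Illustrative sketch only, not the backend's get_active_gpu_count: with
// fewer radix blocks than GPUs, only the first num_radix_blocks devices
// receive work, so scratch buffers past that index would never be used.
uint32_t active_gpu_count_sketch(uint32_t num_radix_blocks,
                                 uint32_t gpu_count) {
  return std::min(num_radix_blocks, gpu_count);
}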
5 changes: 2 additions & 3 deletions backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -136,10 +136,9 @@ void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes,

/// If the number of radix blocks is lower than the number of GPUs, not all
/// GPUs will be active and there will be 1 input per GPU
-auto active_gpu_count = get_active_gpu_count(num_samples, gpu_count);
if (sync_streams)
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-for (uint i = 0; i < active_gpu_count; i++) {
+for (uint i = 0; i < gpu_count; i++) {
int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);

Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
@@ -158,7 +157,7 @@ void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes,
}

if (sync_streams)
-for (uint i = 0; i < active_gpu_count; i++) {
+for (uint i = 0; i < gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
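Dropping the internal get_active_gpu_count call and looping up to gpu_count is safe because, as the integer.cuh hunks below show, callers now pass active_gpu_count as the gpu_count argument. A hedged sketch of the per-GPU split under a balanced distribution (an assumption; the real get_num_inputs_on_gpu may use a different scheme):

// Hypothetical balanced split, for illustration only: each GPU receives
// floor(num_samples / gpu_count) inputs and the first
// num_samples % gpu_count devices take one extra.
int num_inputs_on_gpu_sketch(int num_samples, int gpu_index, int gpu_count) {
  int base = num_samples / gpu_count;
  return base + (gpu_index < num_samples % gpu_count ? 1 : 0);
}

With num_samples = 2 and gpu_count = 4 this yields {1, 1, 0, 0}: clamping the loop bound to the active count (2) skips exactly the GPUs that would receive zero inputs, so both loop bounds visit the same set of working devices.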
16 changes: 8 additions & 8 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -186,12 +186,12 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe<Torus>(
-streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_in,
+streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_in,
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
big_lwe_dimension + 1, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
-execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
+execute_keyswitch<Torus>(streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
lwe_trivial_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
@@ -200,15 +200,15 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
-streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
+streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
-multi_gpu_gather_lwe<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+multi_gpu_gather_lwe<Torus>(streams, gpu_indexes, active_gpu_count, lwe_array_out,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1, false);
@@ -274,12 +274,12 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_scatter_lwe<Torus>(
-streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
+streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
big_lwe_dimension + 1, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
-execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
+execute_keyswitch<Torus>(streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
lwe_trivial_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
@@ -288,15 +288,15 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
-streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
+streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
-multi_gpu_gather_lwe<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+multi_gpu_gather_lwe<Torus>(streams, gpu_indexes, active_gpu_count, lwe_array_out,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1, false);
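Both lookup-table paths above follow the same four-stage pattern: scatter the radix blocks across the active GPUs, keyswitch each share from the big to the small LWE dimension, run the PBS to apply the LUT and return to the big dimension, then gather the shares back onto GPU 0. A self-contained toy illustration of the scatter/gather bookkeeping, assuming contiguous balanced chunks (an assumption; the real scatter is driven by the LUT's h_lwe_indexes arrays):

#include <algorithm>
#include <cstdio>

int main() {
  const int gpu_count = 4, num_blocks = 6;
  const int active = std::min(num_blocks, gpu_count);
  int offset = 0;
  for (int g = 0; g < active; ++g) {
    // Balanced chunks: the first num_blocks % active GPUs take one extra.
    const int n = num_blocks / active + (g < num_blocks % active ? 1 : 0);
    std::printf("GPU %d processes blocks [%d, %d)\n", g, offset, offset + n);
    offset += n; // gathering shares back in this order restores the
                 // original radix-block indexing on GPU 0
  }
  return 0;
}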
@@ -375,7 +375,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

multi_gpu_scatter_lwe<Torus>(
-streams, gpu_indexes, gpu_count, new_blocks_vec, new_blocks,
+streams, gpu_indexes, active_gpu_count, new_blocks_vec, new_blocks,
luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, total_count,
big_lwe_size, false);
@@ -384,7 +384,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
-execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count,
+execute_keyswitch<Torus>(streams, gpu_indexes, active_gpu_count,
small_lwe_vector_vec, lwe_trivial_indexes_vec,
new_blocks_vec, lwe_trivial_indexes_vec, ksks,
big_lwe_dimension, small_lwe_dimension,
@@ -394,7 +394,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
-streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
+streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
lwe_trivial_indexes_vec, bsks, luts_message_carry->buffer,
@@ -403,7 +403,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
mem_ptr->params.pbs_type, false);

-multi_gpu_gather_lwe<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
+multi_gpu_gather_lwe<Torus>(streams, gpu_indexes, active_gpu_count, new_blocks,
lwe_after_pbs_vec,
luts_message_carry->h_lwe_indexes_out,
luts_message_carry->using_trivial_lwe_indexes,
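The comment in this hunk carries the key constraint: the keyswitch partitions total_count inputs across the GPUs while the following PBS partitions a different number, so the per-GPU streams must drain in between. A hedged sketch of such a barrier written against the plain CUDA runtime API (the backend's cuda_synchronize_stream wrapper is assumed to do something equivalent):

#include <cstdint>
#include <cuda_runtime.h>

// Wait for every active GPU to finish its keyswitch share before any PBS
// work is launched: a GPU's PBS share may read data that a different
// GPU's keyswitch produced.
void sync_active_gpus(cudaStream_t *streams, const uint32_t *gpu_indexes,
                      uint32_t active_gpu_count) {
  for (uint32_t i = 0; i < active_gpu_count; i++) {
    cudaSetDevice(static_cast<int>(gpu_indexes[i])); // bind the stream's device
    cudaStreamSynchronize(streams[i]);               // drain work queued on it
  }
}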
@@ -129,8 +129,6 @@ void execute_pbs(
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, PBS_TYPE pbs_type, bool sync_streams = true) {
-auto active_gpu_count =
-    get_active_gpu_count(input_lwe_ciphertext_count, gpu_count);
if (sync_streams)
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
switch (sizeof(Torus)) {
@@ -140,7 +138,7 @@ void execute_pbs(
case MULTI_BIT:
PANIC("Error: 32-bit multibit PBS is not supported.\n")
case CLASSICAL:
-for (uint i = 0; i < active_gpu_count; i++) {
+for (uint i = 0; i < gpu_count; i++) {
int num_inputs_on_gpu =
get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);

@@ -179,7 +177,7 @@ void execute_pbs(
case MULTI_BIT:
if (grouping_factor == 0)
PANIC("Multi-bit PBS error: grouping factor should be > 0.")
-for (uint i = 0; i < active_gpu_count; i++) {
+for (uint i = 0; i < gpu_count; i++) {
int num_inputs_on_gpu =
get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);

@@ -208,7 +206,7 @@ void execute_pbs(
}
break;
case CLASSICAL:
-for (uint i = 0; i < active_gpu_count; i++) {
+for (uint i = 0; i < gpu_count; i++) {
int num_inputs_on_gpu =
get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);

@@ -246,7 +244,7 @@ void execute_pbs(
}

if (sync_streams)
-for (uint i = 0; i < active_gpu_count; i++) {
+for (uint i = 0; i < gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
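Taken together, the commit moves the clamping responsibility to the call sites: execute_keyswitch and execute_pbs now trust the gpu_count they receive. A simplified sketch of the resulting calling convention, with heavily abbreviated signatures (the real functions also take key material, index arrays, and full parameter sets):

#include <cstdint>
#include <cuda_runtime.h>

// Hypothetical, abbreviated declarations for illustration only.
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count);
template <typename Torus>
void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes,
                       uint32_t gpu_count /*, ... */);
template <typename Torus>
void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
                 uint32_t gpu_count /*, ... */);

template <typename Torus>
void caller_sketch(cudaStream_t *streams, uint32_t *gpu_indexes,
                   uint32_t gpu_count, uint32_t num_radix_blocks) {
  // Clamp once at the top level...
  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
  // ...then pass the clamped value down as gpu_count, so the callees'
  // plain `for (i = 0; i < gpu_count; i++)` loops only touch active GPUs.
  execute_keyswitch<Torus>(streams, gpu_indexes, active_gpu_count /*, ... */);
  execute_pbs<Torus>(streams, gpu_indexes, active_gpu_count /*, ... */);
}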
