refactor(gpu): implement a bypass to avoid wasting time with multi-gpu scatter/gather logic when running in a single-gpu environment
pdroalves authored and agnesLeroy committed Jun 27, 2024
1 parent 3276661 commit cf72e95
Showing 3 changed files with 179 additions and 125 deletions.
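At its core, the commit wraps the existing multi-GPU scatter/gather path in an else branch and adds a direct single-GPU path that keyswitches and bootstraps in place on GPU 0, with no host-side index copies and no scatter/gather round trips. The sketch below illustrates that shape only; it is not part of the commit, and the helper names are hypothetical stand-ins for the backend's execute_keyswitch, execute_pbs, multi_gpu_lwe_scatter and multi_gpu_lwe_gather calls.

#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins for the CUDA backend helpers.
static void keyswitch_then_pbs_on_gpu0() { std::puts("KS + PBS in place on GPU 0"); }
static void copy_indexes_to_host()       { std::puts("copy LWE indexes to the host"); }
static void scatter_blocks_to_gpus()     { std::puts("scatter radix blocks across GPUs"); }
static void keyswitch_then_pbs_per_gpu() { std::puts("KS + PBS on each GPU's shard"); }
static void gather_blocks_on_gpu0()      { std::puts("gather results back on GPU 0"); }

// Shape of the bypass: only pay for the multi-GPU machinery when it is needed.
void apply_lut(std::uint32_t gpu_count) {
  if (gpu_count == 1) {
    keyswitch_then_pbs_on_gpu0();
  } else {
    copy_indexes_to_host();
    scatter_blocks_to_gpus();
    keyswitch_then_pbs_per_gpu();
    gather_blocks_on_gpu0();
  }
}

int main() {
  apply_lut(1); // single-GPU: direct path
  apply_lut(2); // multi-GPU: scatter/gather path
}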
5 changes: 0 additions & 5 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -472,7 +472,6 @@ template <typename Torus> struct int_radix_lut {
// lwe_indexes_in != lwe_indexes_out
Torus *lwe_trivial_indexes;
Torus *tmp_lwe_before_ks;
Torus *tmp_lwe_after_ks;

/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
@@ -583,8 +582,6 @@ template <typename Torus> struct int_radix_lut {
(params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
tmp_lwe_before_ks =
(Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
tmp_lwe_after_ks =
(Torus *)cuda_malloc_async(small_size, streams[0], gpu_indexes[0]);
}
}

@@ -606,7 +603,6 @@ template <typename Torus> struct int_radix_lut {
buffer = base_lut_object->buffer;
// Keyswitch
tmp_lwe_before_ks = base_lut_object->tmp_lwe_before_ks;
tmp_lwe_after_ks = base_lut_object->tmp_lwe_after_ks;

/// With multiple GPUs we allocate arrays to be pushed to the vectors and
/// copy data on each GPU then when we gather data to GPU 0 we can copy back
@@ -729,7 +725,6 @@ template <typename Torus> struct int_radix_lut {

if (!mem_reuse) {
cuda_drop_async(tmp_lwe_before_ks, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_lwe_after_ks, streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (int i = 0; i < buffer.size(); i++) {
switch (params.pbs_type) {
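Note (editorial, not part of the diff): tmp_lwe_after_ks can be dropped here presumably because the new single-GPU path in integer.cuh writes its keyswitch output into lwe_after_ks_vec[0], the first of the per-GPU buffers the structure already holds, so a separate single-GPU temporary is no longer needed.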
176 changes: 105 additions & 71 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -164,44 +164,61 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
if (gpu_count == 1) {
execute_keyswitch<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], lwe_array_in,
lut->lwe_indexes_in, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks, false);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
} else {
auto h_lwe_indexes_in = lut->h_lwe_indexes_in;
auto h_lwe_indexes_out = lut->h_lwe_indexes_out;
cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lut->lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lut->lwe_indexes_out,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

auto h_lwe_indexes_in = lut->h_lwe_indexes_in;
auto h_lwe_indexes_out = lut->h_lwe_indexes_out;
cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lut->lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lut->lwe_indexes_out,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

/// With multiple GPUs we push to the vectors on each GPU then when we gather
/// data to GPU 0 we can copy back to the original indexing
multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
lwe_array_in_vec, lwe_array_in, h_lwe_indexes_in,
num_radix_blocks, big_lwe_dimension + 1, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
lwe_trivial_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks, false);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_after_pbs_vec, h_lwe_indexes_out,
num_radix_blocks, big_lwe_dimension + 1, false);
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_in,
h_lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
lwe_trivial_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks, false);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_after_pbs_vec, h_lwe_indexes_out,
num_radix_blocks, big_lwe_dimension + 1, false);
}

/// Synchronize all GPUs
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
@@ -245,42 +262,59 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
if (gpu_count == 1) {
execute_keyswitch<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], lwe_array_pbs_in,
lut->lwe_indexes_in, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks, false);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
} else {
auto h_lwe_indexes_in = lut->h_lwe_indexes_in;
auto h_lwe_indexes_out = lut->h_lwe_indexes_out;
cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lut->lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lut->lwe_indexes_out,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

auto h_lwe_indexes_in = lut->h_lwe_indexes_in;
auto h_lwe_indexes_out = lut->h_lwe_indexes_out;
cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lut->lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lut->lwe_indexes_out,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
h_lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1, false);

multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
h_lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
lwe_trivial_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks, false);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_after_pbs_vec, h_lwe_indexes_out,
num_radix_blocks, big_lwe_dimension + 1, false);
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
lwe_trivial_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks, false);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_after_pbs_vec, h_lwe_indexes_out,
num_radix_blocks, big_lwe_dimension + 1, false);
}

/// Synchronize all GPUs
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
123 changes: 74 additions & 49 deletions backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -343,55 +343,80 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
std::vector<Torus *> lwe_trivial_indexes_vec =
luts_message_carry->lwe_trivial_indexes_vec;

auto h_lwe_indexes_in = luts_message_carry->h_lwe_indexes_in;
auto h_lwe_indexes_out = luts_message_carry->h_lwe_indexes_out;
cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lwe_indexes_in,
total_count * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lwe_indexes_out,
total_count * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
new_blocks_vec, new_blocks, h_lwe_indexes_in,
message_count, big_lwe_size, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec, ksks,
big_lwe_dimension, lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, message_count, false);

/// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
/// different configuration
multi_gpu_lwe_gather<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector, small_lwe_vector_vec,
h_lwe_indexes_in, message_count, small_lwe_size);

multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector_vec, small_lwe_vector,
h_lwe_indexes_in, total_count, small_lwe_size, false);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec,
small_lwe_vector_vec, lwe_trivial_indexes_vec, bsks,
luts_message_carry->buffer, glwe_dimension,
lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count, 2, 0,
max_shared_memory, mem_ptr->params.pbs_type, false);

multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
lwe_after_pbs_vec, h_lwe_indexes_out,
total_count, big_lwe_size);
if (gpu_count == 1) {
/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector, lwe_indexes_in,
new_blocks, lwe_indexes_in, ksks, polynomial_size * glwe_dimension,
lwe_dimension, mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
message_count, true);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, gpu_count, new_blocks, lwe_indexes_out,
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer,
glwe_dimension, lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
mem_ptr->params.pbs_type, true);
} else {
auto h_lwe_indexes_in = luts_message_carry->h_lwe_indexes_in;
auto h_lwe_indexes_out = luts_message_carry->h_lwe_indexes_out;
cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lwe_indexes_in,
total_count * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lwe_indexes_out,
total_count * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
new_blocks_vec, new_blocks, h_lwe_indexes_in,
message_count, big_lwe_size, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
ksks, big_lwe_dimension, lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, message_count, false);

/// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
/// different configuration
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count,
small_lwe_vector, small_lwe_vector_vec,
h_lwe_indexes_in, message_count,
small_lwe_size);

multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
small_lwe_vector_vec, small_lwe_vector,
h_lwe_indexes_in, total_count,
small_lwe_size, false);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
lwe_trivial_indexes_vec, bsks, luts_message_carry->buffer,
glwe_dimension, lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
mem_ptr->params.pbs_type, false);

multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
lwe_after_pbs_vec, h_lwe_indexes_out,
total_count, big_lwe_size);
}

luts_message_carry->release(streams, gpu_indexes, gpu_count);
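Note (editorial, not part of the diff): in host_integer_sum_ciphertexts_vec_kb the single-GPU branch runs the keyswitch on message_count blocks and the PBS on total_count blocks directly through the LUT's own lwe_indexes_in/lwe_indexes_out, passing true as the final argument (presumably the synchronization the preceding comment calls for, since the two calls operate on different numbers of inputs), while the multi-GPU branch keeps the scatter, gather and re-scatter sequence through lwe_trivial_indexes_vec with that flag set to false.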

