refactor(gpu): modify apply lut methods to assume inputs and outputs have trivial indexing
pdroalves authored and agnesLeroy committed Jun 27, 2024
1 parent cf72e95 commit 1bfa71f
Showing 4 changed files with 56 additions and 65 deletions.
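In this commit, "trivial indexing" means the i-th LWE block of a radix ciphertext is stored at slot i, so the multi-GPU lookup-table helpers no longer need to consult the h_lwe_indexes_in / h_lwe_indexes_out arrays when moving data between GPUs. A minimal sketch of what such an identity index array looks like (illustrative only, not code from this commit):

#include <cstdint>
#include <vector>

// Build the identity mapping 0, 1, ..., n-1: block i is read from and written
// to slot i, which is exactly the assumption the refactored helpers rely on.
std::vector<uint64_t> make_trivial_indexes(uint32_t num_radix_blocks) {
  std::vector<uint64_t> indexes(num_radix_blocks);
  for (uint32_t i = 0; i < num_radix_blocks; ++i)
    indexes[i] = i;
  return indexes;
}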
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -498,7 +498,7 @@ template <typename Torus> struct int_radix_lut {
cudaSetDevice(i);
int8_t *gpu_pbs_buffer;
auto num_blocks_on_gpu =
- get_num_inputs_on_gpu(num_radix_blocks, i, gpu_count);
+ get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);

execute_scratch_pbs<Torus>(
streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension,
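The hunk above now sizes the per-GPU PBS scratch buffer from active_gpu_count rather than gpu_count. One plausible way such a helper balances the blocks, sketched under the assumption of an even split with the remainder going to the first GPUs (the real get_num_inputs_on_gpu may differ):

#include <cstdint>

// Sketch: how many of `num_inputs` blocks land on GPU `gpu_index` when the
// work is spread over `active_gpu_count` GPUs as evenly as possible.
uint32_t sketch_num_inputs_on_gpu(uint32_t num_inputs, uint32_t gpu_index,
                                  uint32_t active_gpu_count) {
  uint32_t base = num_inputs / active_gpu_count;
  uint32_t remainder = num_inputs % active_gpu_count;
  return base + (gpu_index < remainder ? 1 : 0);
}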
80 changes: 35 additions & 45 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -155,16 +155,15 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
auto polynomial_size = params.polynomial_size;
auto grouping_factor = params.grouping_factor;

- /// Make sure all data that should be on GPU 0 is indeed there
- cuda_synchronize_stream(streams[0], gpu_indexes[0]);

/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
- if (gpu_count == 1) {

+ auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+ if (active_gpu_count == 1) {
execute_keyswitch<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], lwe_array_in,
lut->lwe_indexes_in, ksks, big_lwe_dimension,
@@ -181,21 +180,14 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
} else {
- auto h_lwe_indexes_in = lut->h_lwe_indexes_in;
- auto h_lwe_indexes_out = lut->h_lwe_indexes_out;
- cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lut->lwe_indexes_in,
- num_radix_blocks * sizeof(Torus), streams[0],
- gpu_indexes[0]);
- cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lut->lwe_indexes_out,
- num_radix_blocks * sizeof(Torus), streams[0],
- gpu_indexes[0]);
+ /// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
- multi_gpu_lwe_scatter<Torus>(
+ multi_gpu_lwe_trivial_scatter<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_in,
- h_lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1, false);
+ num_radix_blocks, big_lwe_dimension + 1, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
@@ -215,15 +207,14 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
- multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
- lwe_after_pbs_vec, h_lwe_indexes_out,
- num_radix_blocks, big_lwe_dimension + 1, false);
- }
+ multi_gpu_lwe_trivial_gather<Torus>(
+ streams, gpu_indexes, gpu_count, lwe_array_out, lwe_after_pbs_vec,
+ num_radix_blocks, big_lwe_dimension + 1, false);

- /// Synchronize all GPUs
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- for (uint i = 0; i < active_gpu_count; i++) {
- cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+ /// Synchronize all GPUs
+ for (uint i = 0; i < active_gpu_count; i++) {
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
}
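The single-GPU fast path is now selected from the number of GPUs that actually receive blocks rather than from the total GPU count, so a small ciphertext (say, two blocks on an eight-GPU machine) skips the scatter/gather path entirely. A sketch of the assumed gating rule (the real get_active_gpu_count may differ):

#include <algorithm>
#include <cstdint>

// Assumed behaviour: never activate more GPUs than there are blocks to process.
uint32_t sketch_active_gpu_count(uint32_t num_radix_blocks, uint32_t gpu_count) {
  return std::min(num_radix_blocks, gpu_count);
}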

@@ -254,15 +245,15 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
num_radix_blocks);
check_cuda_error(cudaGetLastError());

- cuda_synchronize_stream(streams[0], gpu_indexes[0]);

/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
- if (gpu_count == 1) {

+ auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+ if (active_gpu_count == 1) {
execute_keyswitch<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], lwe_array_pbs_in,
lut->lwe_indexes_in, ksks, big_lwe_dimension,
@@ -279,19 +270,14 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
} else {
- auto h_lwe_indexes_in = lut->h_lwe_indexes_in;
- auto h_lwe_indexes_out = lut->h_lwe_indexes_out;
- cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lut->lwe_indexes_in,
- num_radix_blocks * sizeof(Torus), streams[0],
- gpu_indexes[0]);
- cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lut->lwe_indexes_out,
- num_radix_blocks * sizeof(Torus), streams[0],
- gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

- multi_gpu_lwe_scatter<Torus>(
+ // multi_gpu_lwe_scatter<Torus>(
+ // streams, gpu_indexes, gpu_count, lwe_array_in_vec,
+ // lwe_array_pbs_in, h_lwe_indexes_in, num_radix_blocks,
+ // big_lwe_dimension + 1, false);
+ multi_gpu_lwe_trivial_scatter<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
- h_lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1, false);
+ num_radix_blocks, big_lwe_dimension + 1, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
@@ -311,15 +297,19 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
- multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
- lwe_after_pbs_vec, h_lwe_indexes_out,
- num_radix_blocks, big_lwe_dimension + 1, false);
- }
-
- /// Synchronize all GPUs
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- for (uint i = 0; i < active_gpu_count; i++) {
- cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+ // multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count,
+ // lwe_array_out,
+ // lwe_after_pbs_vec, h_lwe_indexes_out,
+ // num_radix_blocks, big_lwe_dimension + 1,
+ // false);
+ multi_gpu_lwe_trivial_gather<Torus>(
+ streams, gpu_indexes, gpu_count, lwe_array_out, lwe_after_pbs_vec,
+ num_radix_blocks, big_lwe_dimension + 1, false);
+
+ /// Synchronize all GPUs
+ for (uint i = 0; i < active_gpu_count; i++) {
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
}
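Because the blocks are assumed to be in natural order, the trivial scatter can copy one contiguous chunk to each active GPU instead of gathering blocks through h_lwe_indexes_in, as the commented-out call used to do. A rough sketch with the plain CUDA runtime API (the backend's helpers wrap these calls and handle indexes, sizes and error checking differently); the trivial gather is simply the mirror image, copying each chunk back to the same contiguous slot on GPU 0:

#include <cuda_runtime.h>
#include <cstdint>
#include <vector>

// Scatter `num_inputs` LWEs of `lwe_size` words each from GPU 0 to the active
// GPUs in contiguous chunks; no per-block index array is needed.
void sketch_trivial_scatter(cudaStream_t *streams, const int *gpu_ids,
                            const std::vector<uint64_t *> &dest,
                            const uint64_t *src_on_gpu0, uint32_t num_inputs,
                            uint32_t lwe_size) {
  uint32_t active = static_cast<uint32_t>(dest.size());
  uint32_t offset = 0;
  for (uint32_t g = 0; g < active; ++g) {
    uint32_t count = num_inputs / active + (g < num_inputs % active ? 1 : 0);
    cudaMemcpyPeerAsync(dest[g], gpu_ids[g], src_on_gpu0 + offset * lwe_size,
                        gpu_ids[0],
                        static_cast<size_t>(count) * lwe_size * sizeof(uint64_t),
                        streams[g]);
    offset += count; // next GPU's chunk starts right after this one
  }
}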

19 changes: 10 additions & 9 deletions backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -331,8 +331,6 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(

luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

- auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);

/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
@@ -343,21 +341,24 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
std::vector<Torus *> lwe_trivial_indexes_vec =
luts_message_carry->lwe_trivial_indexes_vec;

- if (gpu_count == 1) {
+ // TODO We should be able to run keyswitch on a single GPU and PBS on
+ // multiple GPUs if needed
+ auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
+ if (active_gpu_count == 1) {
/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
- execute_keyswitch<Torus>(
- streams, gpu_indexes, gpu_count, small_lwe_vector, lwe_indexes_in,
- new_blocks, lwe_indexes_in, ksks, polynomial_size * glwe_dimension,
- lwe_dimension, mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
- message_count, true);
+ execute_keyswitch<Torus>(streams, gpu_indexes, 1, small_lwe_vector,
+ lwe_indexes_in, new_blocks, lwe_indexes_in, ksks,
+ polynomial_size * glwe_dimension, lwe_dimension,
+ mem_ptr->params.ks_base_log,
+ mem_ptr->params.ks_level, message_count, true);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
- streams, gpu_indexes, gpu_count, new_blocks, lwe_indexes_out,
+ streams, gpu_indexes, 1, new_blocks, lwe_indexes_out,
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer,
glwe_dimension, lwe_dimension, polynomial_size,
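The comment retained in the hunk above notes that the keyswitch and PBS do not operate on the same number of inputs (message_count versus the total block count), so the streams have to be synchronized between the two calls. A minimal sketch of such a barrier with the raw CUDA runtime API (the backend uses its own cuda_synchronize_stream wrapper):

#include <cuda_runtime.h>
#include <cstdint>

// Block the host until every stream has drained, one GPU at a time.
void sketch_sync_all_streams(cudaStream_t *streams, const uint32_t *gpu_indexes,
                             uint32_t count) {
  for (uint32_t i = 0; i < count; ++i) {
    cudaSetDevice(static_cast<int>(gpu_indexes[i]));
    cudaStreamSynchronize(streams[i]);
  }
}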
20 changes: 10 additions & 10 deletions backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
@@ -55,7 +55,7 @@ void multi_gpu_lwe_init(cudaStream_t *streams, uint32_t *gpu_indexes,
dest.resize(active_gpu_count);
#pragma omp parallel for num_threads(active_gpu_count)
for (uint i = 0; i < active_gpu_count; i++) {
- auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+ auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
Torus *d_array = (Torus *)cuda_malloc_async(
inputs_on_gpu * elements_per_input * sizeof(Torus), streams[i],
gpu_indexes[i]);
@@ -83,12 +83,12 @@ void multi_gpu_lwe_scatter(cudaStream_t *streams, uint32_t *gpu_indexes,
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

dest.resize(active_gpu_count);
- // #pragma omp parallel for num_threads(active_gpu_count)
+ #pragma omp parallel for num_threads(active_gpu_count)
for (uint i = 0; i < active_gpu_count; i++) {
- auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+ auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
auto gpu_offset = 0;
for (uint j = 0; j < i; j++) {
- gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
+ gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
}
auto src_indexes = h_src_indexes + gpu_offset;

@@ -127,10 +127,10 @@ void multi_gpu_lwe_gather(cudaStream_t *streams, uint32_t *gpu_indexes,

#pragma omp parallel for num_threads(active_gpu_count)
for (uint i = 0; i < active_gpu_count; i++) {
- auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+ auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
auto gpu_offset = 0;
for (uint j = 0; j < i; j++) {
- gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
+ gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
}
auto dest_indexes = h_dest_indexes + gpu_offset;

@@ -170,10 +170,10 @@ void multi_gpu_lwe_trivial_scatter(cudaStream_t *streams, uint32_t *gpu_indexes,

#pragma omp parallel for num_threads(active_gpu_count)
for (uint i = 0; i < active_gpu_count; i++) {
- auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+ auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
auto gpu_offset = 0;
for (uint j = 0; j < i; j++) {
- gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
+ gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
}

auto d_dest = dest[i];
@@ -206,10 +206,10 @@ void multi_gpu_lwe_trivial_gather(cudaStream_t *streams, uint32_t *gpu_indexes,

#pragma omp parallel for num_threads(active_gpu_count)
for (uint i = 0; i < active_gpu_count; i++) {
- auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+ auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, active_gpu_count);
auto gpu_offset = 0;
for (uint j = 0; j < i; j++) {
- gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
+ gpu_offset += get_num_inputs_on_gpu(num_inputs, j, active_gpu_count);
}

auto d_dest = dest + gpu_offset * elements_per_input;
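Every hunk in this file recomputes gpu_offset as the prefix sum of the input counts of the preceding GPUs, now taken over active_gpu_count. A sketch of that computation in one place, using the same assumed balancing rule as above (the real helper may differ):

#include <cstdint>
#include <vector>

// offsets[i] = number of inputs held by GPUs 0..i-1, i.e. where GPU i's chunk
// starts in the flat array on GPU 0.
std::vector<uint32_t> sketch_gpu_offsets(uint32_t num_inputs,
                                         uint32_t active_gpu_count) {
  std::vector<uint32_t> offsets(active_gpu_count, 0);
  for (uint32_t i = 1; i < active_gpu_count; ++i) {
    uint32_t prev = num_inputs / active_gpu_count +
                    (i - 1 < num_inputs % active_gpu_count ? 1 : 0);
    offsets[i] = offsets[i - 1] + prev;
  }
  return offsets;
}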
