
Commit

chore(gpu): rewrite the scatter/gather logic and move to scratch functions
pdroalves authored and Ubuntu committed Jun 18, 2024
1 parent 128e05e commit 10263d1
Showing 4 changed files with 181 additions and 190 deletions.
30 changes: 30 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -473,6 +473,13 @@ template <typename Torus> struct int_radix_lut {
Torus *tmp_lwe_before_ks;
Torus *tmp_lwe_after_ks;

/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec;
std::vector<Torus *> lwe_after_ks_vec;
std::vector<Torus *> lwe_indexes_in_vec;
std::vector<Torus *> lwe_trivial_indexes_vec;

int_radix_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
@@ -548,6 +555,16 @@ template <typename Torus> struct int_radix_lut {
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes);

/// With multiple GPUs we allocate arrays to be pushed to the vectors and
/// copy data on each GPU then when we gather data to GPU 0 we can copy
/// back to the original indexing
multi_gpu_lwe_init(streams, gpu_indexes, gpu_count, lwe_array_in_vec,
lwe_indexes_in_vec, num_radix_blocks,
params.big_lwe_dimension + 1);
multi_gpu_lwe_init(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
lwe_trivial_indexes_vec, num_radix_blocks,
params.small_lwe_dimension + 1);

// Keyswitch
Torus big_size =
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
@@ -580,6 +597,14 @@ template <typename Torus> struct int_radix_lut {
tmp_lwe_before_ks = base_lut_object->tmp_lwe_before_ks;
tmp_lwe_after_ks = base_lut_object->tmp_lwe_after_ks;

/// With multiple GPUs we allocate arrays to be pushed to the vectors and
/// copy data on each GPU then when we gather data to GPU 0 we can copy back
/// to the original indexing
lwe_array_in_vec = base_lut_object->lwe_array_in_vec;
lwe_after_ks_vec = base_lut_object->lwe_after_ks_vec;
lwe_indexes_in_vec = base_lut_object->lwe_indexes_in_vec;
lwe_trivial_indexes_vec = base_lut_object->lwe_trivial_indexes_vec;

mem_reuse = true;

// Allocate LUT
@@ -701,6 +726,11 @@ template <typename Torus> struct int_radix_lut {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
buffer.clear();

multi_gpu_lwe_release(streams, gpu_indexes, lwe_array_in_vec);
multi_gpu_lwe_release(streams, gpu_indexes, lwe_after_ks_vec);
multi_gpu_lwe_release(streams, gpu_indexes, lwe_indexes_in_vec);
multi_gpu_lwe_release(streams, gpu_indexes, lwe_trivial_indexes_vec);
}
}
};
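With this commit the per-GPU scratch buffers become part of the int_radix_lut lifecycle: the main constructor allocates them through multi_gpu_lwe_init, the memory-reuse constructor aliases the base object's vectors, and the release path frees them through multi_gpu_lwe_release. The bodies of those helpers are not part of this diff; the sketch below shows one plausible shape, assuming blocks are split as evenly as possible across GPUs. Only the helper names come from the commit; signatures and bodies here are illustrative assumptions.

// Hypothetical sketch: only the names multi_gpu_lwe_init and
// multi_gpu_lwe_release appear in the diff; everything else is assumed.
#include <cstdint>
#include <vector>
#include <cuda_runtime.h>

template <typename Torus>
void multi_gpu_lwe_init(cudaStream_t *streams, uint32_t *gpu_indexes,
                        uint32_t gpu_count, std::vector<Torus *> &dest_vec,
                        std::vector<Torus *> &indexes_vec,
                        uint32_t num_radix_blocks, uint32_t lwe_size) {
  for (uint32_t i = 0; i < gpu_count; i++) {
    // Assumption: blocks are split as evenly as possible across GPUs.
    uint32_t blocks_on_gpu = num_radix_blocks / gpu_count +
                             (i < num_radix_blocks % gpu_count ? 1 : 0);
    cudaSetDevice(gpu_indexes[i]);
    Torus *buf = nullptr;
    cudaMallocAsync((void **)&buf,
                    (size_t)blocks_on_gpu * lwe_size * sizeof(Torus),
                    streams[i]);
    Torus *idx = nullptr;
    cudaMallocAsync((void **)&idx, (size_t)blocks_on_gpu * sizeof(Torus),
                    streams[i]);
    dest_vec.push_back(buf);
    indexes_vec.push_back(idx);
  }
}

template <typename Torus>
void multi_gpu_lwe_release(cudaStream_t *streams, uint32_t *gpu_indexes,
                           std::vector<Torus *> &vec) {
  // Free each per-GPU buffer on its own device and stream.
  for (size_t i = 0; i < vec.size(); i++) {
    cudaSetDevice(gpu_indexes[i]);
    cudaFreeAsync(vec[i], streams[i]);
  }
  vec.clear();
}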
105 changes: 29 additions & 76 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -160,52 +160,28 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(

/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec;
std::vector<Torus *> lwe_after_ks_vec;
std::vector<Torus *> lwe_indexes_in_vec;
std::vector<Torus *> lwe_trivial_indexes_vec;

/// With multiple GPUs we allocate arrays to be pushed to the vectors and copy
/// data on each GPU then when we gather data to GPU 0 we can copy back to the
/// original indexing
if (gpu_count > 1) {
multi_gpu_scatter<Torus>(streams, gpu_indexes, gpu_count, lwe_array_in_vec,
lwe_array_in, lwe_indexes_in_vec,
lut->lwe_indexes_in, num_radix_blocks,
big_lwe_dimension + 1);
multi_gpu_scatter<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
lut->tmp_lwe_after_ks, lwe_trivial_indexes_vec,
lut->lwe_trivial_indexes, num_radix_blocks,
small_lwe_dimension + 1);
} else {
/// GPU 0 retains the original array
lwe_array_in_vec.push_back(lwe_array_in);
lwe_after_ks_vec.push_back(lut->tmp_lwe_after_ks);
lwe_indexes_in_vec.push_back(lut->lwe_indexes_in);
lwe_trivial_indexes_vec.push_back(lut->lwe_trivial_indexes);
}
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
std::vector<Torus *> lwe_indexes_in_vec = lut->lwe_indexes_in_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

/// With multiple GPUs we push to the vectors on each GPU then when we gather
/// data to GPU 0 we can copy back to the original indexing
multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_in,
lut->lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
lwe_trivial_indexes_vec, lwe_array_in_vec,
lwe_indexes_in_vec, ksks, big_lwe_dimension,
lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks, false);
/// Copy data back to GPU 0 and release vecs
if (gpu_count > 1) {
multi_gpu_gather<Torus>(streams, gpu_indexes, gpu_count,
lut->tmp_lwe_after_ks, lwe_after_ks_vec,
lut->lwe_trivial_indexes, num_radix_blocks,
small_lwe_dimension + 1);
multi_gpu_release<Torus>(streams, gpu_indexes, lwe_array_in_vec);
multi_gpu_release<Torus>(streams, gpu_indexes, lwe_after_ks_vec);
multi_gpu_release<Torus>(streams, gpu_indexes, lwe_indexes_in_vec);
multi_gpu_release<Torus>(streams, gpu_indexes, lwe_trivial_indexes_vec);
}
lwe_array_in_vec.clear();
lwe_after_ks_vec.clear();
lwe_indexes_in_vec.clear();
lwe_trivial_indexes_vec.clear();

/// Copy data back to GPU 0
multi_gpu_lwe_gather<Torus>(
streams, gpu_indexes, gpu_count, lut->tmp_lwe_after_ks, lwe_after_ks_vec,
lut->lwe_trivial_indexes, num_radix_blocks, small_lwe_dimension + 1);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
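The univariate path is thus reduced to a fixed scatter, keyswitch, gather pipeline over buffers owned by the LUT object, with no gpu_count branch and no inline allocation or release. multi_gpu_lwe_scatter itself is not shown in this diff; the sketch below illustrates the kind of block distribution such a helper performs, assuming an even split and peer copies from GPU 0, and omitting the lwe_indexes_in indirection that the real helper applies while distributing.

// Hypothetical sketch of the distribution step; the name carries a
// _sketch suffix to make clear it is not the backend's implementation.
#include <cstdint>
#include <vector>
#include <cuda_runtime.h>

template <typename Torus>
void multi_gpu_lwe_scatter_sketch(cudaStream_t *streams, uint32_t *gpu_indexes,
                                  uint32_t gpu_count,
                                  std::vector<Torus *> &dest_vec,
                                  Torus const *src_on_gpu0,
                                  uint32_t num_radix_blocks,
                                  uint32_t lwe_size) {
  uint32_t offset = 0;
  for (uint32_t i = 0; i < gpu_count; i++) {
    uint32_t blocks_on_gpu = num_radix_blocks / gpu_count +
                             (i < num_radix_blocks % gpu_count ? 1 : 0);
    size_t bytes = (size_t)blocks_on_gpu * lwe_size * sizeof(Torus);
    // Device-to-device copy of GPU 0's chunk into GPU i's scratch buffer.
    cudaMemcpyPeerAsync(dest_vec[i], gpu_indexes[i],
                        src_on_gpu0 + (size_t)offset * lwe_size,
                        gpu_indexes[0], bytes, streams[i]);
    offset += blocks_on_gpu;
  }
}

The gather direction is symmetric: each GPU's chunk is copied back to its original offset in the GPU 0 buffer (a sketch follows the multiplication.cuh diff below).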
@@ -233,7 +209,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(

auto params = lut->params;
auto pbs_type = params.pbs_type;
uint32_t big_lwe_dimension = params.big_lwe_dimension;
auto big_lwe_dimension = params.big_lwe_dimension;
auto small_lwe_dimension = params.small_lwe_dimension;
auto ks_level = params.ks_level;
auto ks_base_log = params.ks_base_log;
@@ -255,49 +231,26 @@

/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec;
std::vector<Torus *> lwe_after_ks_vec;
std::vector<Torus *> lwe_indexes_in_vec;
std::vector<Torus *> lwe_trivial_indexes_vec;
if (gpu_count > 1) {
multi_gpu_scatter<Torus>(streams, gpu_indexes, gpu_count, lwe_array_in_vec,
lwe_array_pbs_in, lwe_indexes_in_vec,
lut->lwe_indexes_in, num_radix_blocks,
big_lwe_dimension + 1);
multi_gpu_scatter<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
lut->tmp_lwe_after_ks, lwe_trivial_indexes_vec,
lut->lwe_trivial_indexes, num_radix_blocks,
small_lwe_dimension + 1);
} else {
/// GPU 0 retains the original array
lwe_array_in_vec.push_back(lwe_array_pbs_in);
lwe_after_ks_vec.push_back(lut->tmp_lwe_after_ks);
lwe_indexes_in_vec.push_back(lut->lwe_indexes_in);
lwe_trivial_indexes_vec.push_back(lut->lwe_trivial_indexes);
}
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
std::vector<Torus *> lwe_indexes_in_vec = lut->lwe_indexes_in_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
lut->lwe_indexes_in, num_radix_blocks, big_lwe_dimension + 1);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
lwe_trivial_indexes_vec, lwe_array_in_vec,
lwe_indexes_in_vec, ksks, big_lwe_dimension,
lwe_trivial_indexes_vec, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks, false);

/// Copy data back to GPU 0 and release vecs
if (gpu_count > 1) {
multi_gpu_gather<Torus>(streams, gpu_indexes, gpu_count,
lut->tmp_lwe_after_ks, lwe_after_ks_vec,
lut->lwe_trivial_indexes, num_radix_blocks,
small_lwe_dimension + 1);
multi_gpu_release<Torus>(streams, gpu_indexes, lwe_array_in_vec);
multi_gpu_release<Torus>(streams, gpu_indexes, lwe_after_ks_vec);
multi_gpu_release<Torus>(streams, gpu_indexes, lwe_indexes_in_vec);
multi_gpu_release<Torus>(streams, gpu_indexes, lwe_trivial_indexes_vec);
}
lwe_array_in_vec.clear();
lwe_after_ks_vec.clear();
lwe_indexes_in_vec.clear();
lwe_trivial_indexes_vec.clear();
multi_gpu_lwe_gather<Torus>(
streams, gpu_indexes, gpu_count, lut->tmp_lwe_after_ks, lwe_after_ks_vec,
lut->lwe_trivial_indexes, num_radix_blocks, small_lwe_dimension + 1);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
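Note the changed input-index argument of execute_keyswitch in both lookup-table paths: it goes from lwe_indexes_in_vec to lwe_trivial_indexes_vec. The scatter already applies the caller's permutation (lut->lwe_indexes_in) while distributing blocks, so the keyswitch can read each per-GPU chunk in natural order. As a small illustration (fill_trivial_indexes is a hypothetical name, not part of the backend):

// Hypothetical illustration: after the scatter has physically reordered
// the blocks, slot i simply holds block i, so the per-GPU index array is
// the identity permutation.
#include <cstdint>

template <typename Torus>
void fill_trivial_indexes(Torus *h_indexes, uint32_t num_blocks) {
  for (uint32_t i = 0; i < num_blocks; i++)
    h_indexes[i] = static_cast<Torus>(i);
}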
63 changes: 22 additions & 41 deletions backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -225,11 +225,12 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
auto message_modulus = mem_ptr->params.message_modulus;
auto carry_modulus = mem_ptr->params.carry_modulus;
auto num_blocks = num_blocks_in_radix;
auto big_lwe_size = mem_ptr->params.big_lwe_dimension + 1;
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
auto glwe_dimension = mem_ptr->params.glwe_dimension;
auto polynomial_size = mem_ptr->params.polynomial_size;
auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
auto small_lwe_size = lwe_dimension + 1;

if (old_blocks != terms) {
cuda_memcpy_async_gpu_to_gpu(old_blocks, terms,
@@ -334,52 +335,32 @@

/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec;
std::vector<Torus *> lwe_after_ks_vec;
std::vector<Torus *> lwe_indexes_in_vec;
std::vector<Torus *> lwe_trivial_indexes_vec;
if (gpu_count > 1) {
multi_gpu_scatter<Torus>(streams, gpu_indexes, gpu_count,
lwe_array_in_vec, new_blocks, lwe_indexes_in_vec,
lwe_indexes_in, message_count,
big_lwe_dimension + 1);
multi_gpu_scatter<Torus>(streams, gpu_indexes, gpu_count,
lwe_after_ks_vec, small_lwe_vector,
lwe_trivial_indexes_vec, lwe_indexes_in,
message_count, lwe_dimension + 1);
} else {
/// GPU 0 retains the original array
lwe_array_in_vec.push_back(new_blocks);
lwe_after_ks_vec.push_back(small_lwe_vector);
lwe_indexes_in_vec.push_back(lwe_indexes_in);
lwe_trivial_indexes_vec.push_back(lwe_indexes_in);
}
std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
std::vector<Torus *> small_lwe_vector_vec =
luts_message_carry->lwe_after_ks_vec;
std::vector<Torus *> lwe_indexes_in_vec =
luts_message_carry->lwe_indexes_in_vec;
std::vector<Torus *> lwe_trivial_indexes_vec =
luts_message_carry->lwe_trivial_indexes_vec;

multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
new_blocks_vec, new_blocks, lwe_indexes_in,
message_count, big_lwe_size);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
lwe_trivial_indexes_vec, lwe_array_in_vec,
lwe_indexes_in_vec, ksks,
polynomial_size * glwe_dimension, lwe_dimension,
mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, message_count, true);
execute_keyswitch<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec, ksks,
big_lwe_dimension, lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, message_count, true);

/// Copy data back to GPU 0 and release vecs
if (gpu_count > 1) {
multi_gpu_gather<Torus>(streams, gpu_indexes, gpu_count, small_lwe_vector,
lwe_after_ks_vec, lwe_indexes_in, message_count,
lwe_dimension + 1);
multi_gpu_release<Torus>(streams, gpu_indexes, lwe_array_in_vec);
multi_gpu_release<Torus>(streams, gpu_indexes, lwe_after_ks_vec);
multi_gpu_release<Torus>(streams, gpu_indexes, lwe_indexes_in_vec);
multi_gpu_release<Torus>(streams, gpu_indexes, lwe_trivial_indexes_vec);
}
lwe_array_in_vec.clear();
lwe_after_ks_vec.clear();
lwe_indexes_in_vec.clear();
lwe_trivial_indexes_vec.clear();
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count,
small_lwe_vector, small_lwe_vector_vec,
lwe_indexes_in, message_count, small_lwe_size);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
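The sum path follows the same pattern, with one visible difference: the keyswitch runs on message_count blocks while the following PBS operates on a different number of inputs, which is presumably why execute_keyswitch is called with a final true here (unlike the false in the integer.cuh paths) and why the comment calls for a stream synchronization in between. For completeness, a gather sketch symmetric to the scatter one above, under the same assumptions (even split, peer copies, index indirection omitted):

// Hypothetical gather sketch; not the backend's multi_gpu_lwe_gather.
#include <cstdint>
#include <vector>
#include <cuda_runtime.h>

template <typename Torus>
void multi_gpu_lwe_gather_sketch(cudaStream_t *streams, uint32_t *gpu_indexes,
                                 uint32_t gpu_count, Torus *dst_on_gpu0,
                                 std::vector<Torus *> const &src_vec,
                                 uint32_t num_radix_blocks, uint32_t lwe_size) {
  uint32_t offset = 0;
  for (uint32_t i = 0; i < gpu_count; i++) {
    uint32_t blocks_on_gpu = num_radix_blocks / gpu_count +
                             (i < num_radix_blocks % gpu_count ? 1 : 0);
    size_t bytes = (size_t)blocks_on_gpu * lwe_size * sizeof(Torus);
    // Copy GPU i's chunk back to its original offset in the GPU 0 buffer.
    cudaMemcpyPeerAsync(dst_on_gpu0 + (size_t)offset * lwe_size,
                        gpu_indexes[0], src_vec[i], gpu_indexes[i], bytes,
                        streams[i]);
    offset += blocks_on_gpu;
  }
}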
[The diff for the fourth changed file (100 additions, 73 deletions) did not load.]
