Skip to content

Commit

Permalink
chore(gpu): rework select to avoid using local streams
Browse files Browse the repository at this point in the history
  • Loading branch information
agnesLeroy committed Dec 13, 2024
1 parent 2ad997a commit a4a5e0e
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 64 deletions.
66 changes: 30 additions & 36 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
Original file line number Diff line number Diff line change
Expand Up @@ -2954,14 +2954,11 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {

template <typename Torus> struct int_cmux_buffer {
int_radix_lut<Torus> *predicate_lut;
int_radix_lut<Torus> *inverted_predicate_lut;
int_radix_lut<Torus> *message_extract_lut;

Torus *tmp_true_ct;
Torus *tmp_false_ct;

int_zero_out_if_buffer<Torus> *zero_if_true_buffer;
int_zero_out_if_buffer<Torus> *zero_if_false_buffer;
Torus *buffer_in;
Torus *buffer_out;
Torus *condition_array;

int_radix_params params;

Expand All @@ -2977,17 +2974,12 @@ template <typename Torus> struct int_cmux_buffer {
Torus big_size =
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);

tmp_true_ct =
(Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
tmp_false_ct =
(Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);

zero_if_true_buffer = new int_zero_out_if_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
allocate_gpu_memory);
zero_if_false_buffer = new int_zero_out_if_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
allocate_gpu_memory);
buffer_in =
(Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]);
buffer_out =
(Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]);
condition_array =
(Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]);

auto lut_f = [predicate_lut_f](Torus block, Torus condition) -> Torus {
return predicate_lut_f(condition) ? 0 : block;
Expand All @@ -3001,12 +2993,8 @@ template <typename Torus> struct int_cmux_buffer {
};

predicate_lut =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_radix_blocks, allocate_gpu_memory);

inverted_predicate_lut =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_radix_blocks, allocate_gpu_memory);
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 2,
2 * num_radix_blocks, allocate_gpu_memory);

message_extract_lut =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
Expand All @@ -3015,40 +3003,46 @@ template <typename Torus> struct int_cmux_buffer {
generate_device_accumulator_bivariate<Torus>(
streams[0], gpu_indexes[0], predicate_lut->get_lut(0, 0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f);
params.carry_modulus, inverted_lut_f);

generate_device_accumulator_bivariate<Torus>(
streams[0], gpu_indexes[0], inverted_predicate_lut->get_lut(0, 0),
streams[0], gpu_indexes[0], predicate_lut->get_lut(0, 1),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, inverted_lut_f);
params.carry_modulus, lut_f);

generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], message_extract_lut->get_lut(0, 0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, message_extract_lut_f);
Torus *h_lut_indexes =
(Torus *)malloc(2 * num_radix_blocks * sizeof(Torus));
for (int index = 0; index < 2 * num_radix_blocks; index++) {
if (index < num_radix_blocks) {
h_lut_indexes[index] = 0;
} else {
h_lut_indexes[index] = 1;
}
}
cuda_memcpy_async_to_gpu(
predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
2 * num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);

predicate_lut->broadcast_lut(streams, gpu_indexes, 0);
inverted_predicate_lut->broadcast_lut(streams, gpu_indexes, 0);
message_extract_lut->broadcast_lut(streams, gpu_indexes, 0);
free(h_lut_indexes);
}
}

void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
predicate_lut->release(streams, gpu_indexes, gpu_count);
delete predicate_lut;
inverted_predicate_lut->release(streams, gpu_indexes, gpu_count);
delete inverted_predicate_lut;
message_extract_lut->release(streams, gpu_indexes, gpu_count);
delete message_extract_lut;

zero_if_true_buffer->release(streams, gpu_indexes, gpu_count);
delete zero_if_true_buffer;
zero_if_false_buffer->release(streams, gpu_indexes, gpu_count);
delete zero_if_false_buffer;

cuda_drop_async(tmp_true_ct, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_false_ct, streams[0], gpu_indexes[0]);
cuda_drop_async(buffer_in, streams[0], gpu_indexes[0]);
cuda_drop_async(buffer_out, streams[0], gpu_indexes[0]);
cuda_drop_async(condition_array, streams[0], gpu_indexes[0]);
}
};

Expand Down
49 changes: 21 additions & 28 deletions backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -37,39 +37,32 @@ __host__ void host_integer_radix_cmux_kb(
uint32_t num_radix_blocks) {

auto params = mem_ptr->params;

// Since our CPU threads will be working on different streams we shall assert
// the work in the main stream is completed
auto true_streams = mem_ptr->zero_if_true_buffer->true_streams;
auto false_streams = mem_ptr->zero_if_false_buffer->false_streams;
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}

auto mem_true = mem_ptr->zero_if_true_buffer;
zero_out_if<Torus>(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
lwe_array_true, lwe_condition, mem_true,
mem_ptr->inverted_predicate_lut, bsks, ksks,
num_radix_blocks);
auto mem_false = mem_ptr->zero_if_false_buffer;
zero_out_if<Torus>(false_streams, gpu_indexes, gpu_count,
mem_ptr->tmp_false_ct, lwe_array_false, lwe_condition,
mem_false, mem_ptr->predicate_lut, bsks, ksks,
num_radix_blocks);
for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
}
for (uint j = 0; j < mem_ptr->zero_if_false_buffer->active_gpu_count; j++) {
cuda_synchronize_stream(false_streams[j], gpu_indexes[j]);
Torus lwe_size = params.big_lwe_dimension + 1;
Torus radix_lwe_size = lwe_size * num_radix_blocks;
cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in, lwe_array_true,
radix_lwe_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in + radix_lwe_size,
lwe_array_false, radix_lwe_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
for (uint i = 0; i < 2 * num_radix_blocks; i++) {
cuda_memcpy_async_gpu_to_gpu(mem_ptr->condition_array + i * lwe_size,
lwe_condition, lwe_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
}
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->buffer_out, mem_ptr->buffer_in,
mem_ptr->condition_array, bsks, ksks, 2 * num_radix_blocks,
mem_ptr->predicate_lut, params.message_modulus);

// If the condition was true, true_ct will have kept its value and false_ct
// will be 0 If the condition was false, true_ct will be 0 and false_ct will
// have kept its value
auto added_cts = mem_ptr->tmp_true_ct;
host_addition<Torus>(streams[0], gpu_indexes[0], added_cts,
mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
params.big_lwe_dimension, num_radix_blocks);
auto mem_true = mem_ptr->buffer_out;
auto mem_false = &mem_ptr->buffer_out[radix_lwe_size];
auto added_cts = mem_true;
host_addition<Torus>(streams[0], gpu_indexes[0], added_cts, mem_true,
mem_false, params.big_lwe_dimension, num_radix_blocks);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
Expand Down

0 comments on commit a4a5e0e

Please sign in to comment.