
refactor(gpu): add a parameter to enable optimizations in case lwe_indexes_(in/out) is trivial
pdroalves committed Jun 26, 2024
1 parent 995c3e1 commit 5eaf4fe
Showing 4 changed files with 109 additions and 160 deletions.
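
The commit adds a `using_trivial_lwe_indexes` flag to `int_radix_lut` so the multi-GPU scatter/gather helpers can tell when the input/output index arrays are still the identity mapping 0, 1, ..., n-1 and skip per-ciphertext indirection. As a rough illustration of the fast path such a flag enables, here is a host-side sketch; it is not code from the repository, and `indexes_are_trivial` and `gather_lwe` are hypothetical names.

```cpp
// Illustrative host-side sketch, not taken from the repository.
// `indexes_are_trivial` and `gather_lwe` are hypothetical helpers.
#include <cstdint>
#include <cstring>
#include <vector>

// True when `indexes` is the identity permutation 0..n-1.
bool indexes_are_trivial(const std::vector<uint64_t> &indexes) {
  for (uint64_t i = 0; i < indexes.size(); i++)
    if (indexes[i] != i)
      return false;
  return true;
}

// Gathers indexes.size() LWE ciphertexts of `lwe_size` words each from `src`
// into `dst`; with trivial indexes this collapses into one contiguous copy.
void gather_lwe(uint64_t *dst, const uint64_t *src,
                const std::vector<uint64_t> &indexes, size_t lwe_size) {
  if (indexes_are_trivial(indexes)) {
    std::memcpy(dst, src, indexes.size() * lwe_size * sizeof(uint64_t));
  } else {
    for (size_t i = 0; i < indexes.size(); i++)
      std::memcpy(dst + i * lwe_size, src + indexes[i] * lwe_size,
                  lwe_size * sizeof(uint64_t));
  }
}
```

Caching the answer in a flag, as the diff does, avoids even this O(n) triviality check: the indexes only stop being trivial when `set_lwe_indexes` is called.
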
61 changes: 35 additions & 26 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -463,6 +463,8 @@ template <typename Torus> struct int_radix_lut {
Torus *lwe_indexes_out;
Torus *h_lwe_indexes_in;
Torus *h_lwe_indexes_out;
// Enable optimizations if lwe_indexes_(in/out) are trivial
bool using_trivial_lwe_indexes = true;
// lwe_trivial_indexes is the intermediary index we need in case
// lwe_indexes_in != lwe_indexes_out
Torus *lwe_trivial_indexes;
@@ -537,22 +539,20 @@ template <typename Torus> struct int_radix_lut {
h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));

auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus));

for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes[i] = i;
h_lwe_indexes_in[i] = i;

cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes);
memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus));

/// With multiple GPUs we allocate arrays to be pushed to the vectors and
/// copy data on each GPU then when we gather data to GPU 0 we can copy
@@ -641,22 +641,20 @@ template <typename Torus> struct int_radix_lut {
h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));

auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus));

for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes[i] = i;
h_lwe_indexes_in[i] = i;

cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes);
memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus));
}

// Return a pointer to idx-ith lut at gpu_index's global memory
@@ -674,6 +672,22 @@ template <typename Torus> struct int_radix_lut {
return &lut_indexes[ind];
}

// If this function is called, we assume the lwe_indexes_(in/out) are no
// longer trivial and thus we disable optimizations
void set_lwe_indexes(cudaStream_t stream, uint32_t gpu_index,
Torus *h_indexes_in, Torus *h_indexes_out) {

memcpy(h_lwe_indexes_in, h_indexes_in, num_blocks * sizeof(Torus));
memcpy(h_lwe_indexes_out, h_indexes_out, num_blocks * sizeof(Torus));

cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
num_blocks * sizeof(Torus), stream, gpu_index);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_out,
num_blocks * sizeof(Torus), stream, gpu_index);

using_trivial_lwe_indexes = false;
}

// Broadcast luts from gpu src_gpu_idx to all active gpus
void broadcast_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t src_gpu_idx) {
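
The new `set_lwe_indexes` method above is the single place where custom mappings enter the structure, which is why it can safely clear the flag. A self-contained sketch of the same pattern (illustrative only; `IndexedLut` is a hypothetical stand-in for `int_radix_lut`, without the device copies):

```cpp
// Illustrative only; IndexedLut is a hypothetical stand-in for int_radix_lut.
#include <cstdint>
#include <cstring>
#include <vector>

struct IndexedLut {
  std::vector<uint64_t> h_lwe_indexes_in, h_lwe_indexes_out; // host mirrors
  bool using_trivial_lwe_indexes = true;

  explicit IndexedLut(size_t num_blocks)
      : h_lwe_indexes_in(num_blocks), h_lwe_indexes_out(num_blocks) {
    for (size_t i = 0; i < num_blocks; i++) {
      h_lwe_indexes_in[i] = i; // trivial identity mapping by default
      h_lwe_indexes_out[i] = i;
    }
  }

  // Mirrors the intent of int_radix_lut::set_lwe_indexes: once a caller
  // installs a custom mapping, the fast paths must be disabled.
  void set_lwe_indexes(const uint64_t *in, const uint64_t *out) {
    std::memcpy(h_lwe_indexes_in.data(), in,
                h_lwe_indexes_in.size() * sizeof(uint64_t));
    std::memcpy(h_lwe_indexes_out.data(), out,
                h_lwe_indexes_out.size() * sizeof(uint64_t));
    using_trivial_lwe_indexes = false;
  }
};
```
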
@@ -805,12 +819,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
for (int i = 0; i < bits_per_block; i++)
h_lwe_indexes_in[i + j * bits_per_block] = j;
}
cuda_memcpy_async_to_gpu(lut->lwe_indexes_in, h_lwe_indexes_in,
num_radix_blocks * bits_per_block *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes_in);

/**
* the output should target different lwe ciphertexts, so lwe_indexes_out =
@@ -822,10 +830,11 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
for (int i = 0; i < num_radix_blocks * bits_per_block; i++)
h_lwe_indexes_out[i] = i;

cuda_memcpy_async_to_gpu(lut->lwe_indexes_out, h_lwe_indexes_out,
num_radix_blocks * bits_per_block *
sizeof(Torus),
streams[0], gpu_indexes[0]);
lut->set_lwe_indexes(streams[0], gpu_indexes[0], h_lwe_indexes_in,
h_lwe_indexes_out);

cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes_in);
cuda_stream_add_callback(streams[0], gpu_indexes[0],
host_free_on_stream_callback, h_lwe_indexes_out);
}
33 changes: 14 additions & 19 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -185,9 +185,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(

/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_lwe_trivial_scatter<Torus>(
multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_in,
num_radix_blocks, big_lwe_dimension + 1, false);
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
big_lwe_dimension + 1, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
@@ -207,9 +208,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
multi_gpu_lwe_trivial_gather<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_after_pbs_vec,
num_radix_blocks, big_lwe_dimension + 1, false);
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1, false);

/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
@@ -271,13 +273,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
// multi_gpu_lwe_scatter<Torus>(
// streams, gpu_indexes, gpu_count, lwe_array_in_vec,
// lwe_array_pbs_in, h_lwe_indexes_in, num_radix_blocks,
// big_lwe_dimension + 1, false);
multi_gpu_lwe_trivial_scatter<Torus>(
multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_in_vec, lwe_array_pbs_in,
num_radix_blocks, big_lwe_dimension + 1, false);
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
big_lwe_dimension + 1, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, lwe_after_ks_vec,
@@ -297,14 +296,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);

/// Copy data back to GPU 0 and release vecs
// multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count,
// lwe_array_out,
// lwe_after_pbs_vec, h_lwe_indexes_out,
// num_radix_blocks, big_lwe_dimension + 1,
// false);
multi_gpu_lwe_trivial_gather<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_after_pbs_vec,
num_radix_blocks, big_lwe_dimension + 1, false);
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1, false);

/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
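
Both lookup-table application paths above now call one pair of helpers, `multi_gpu_lwe_scatter` / `multi_gpu_lwe_gather`, passing the host index arrays together with `using_trivial_lwe_indexes`, instead of the dedicated `multi_gpu_lwe_trivial_*` variants. A hedged CUDA sketch of what a flag-aware gather can look like (the kernel and wrapper are hypothetical, not the repository's helpers, and ignore the multi-GPU chunking):

```cuda
// Hypothetical gather helper, not the repository's multi_gpu_lwe_gather.
#include <cstdint>
#include <cuda_runtime.h>

__global__ void gather_lwe_kernel(uint64_t *dst, const uint64_t *src,
                                  const uint64_t *indexes, uint32_t lwe_size,
                                  uint32_t num_lwe) {
  uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num_lwe * lwe_size) {
    uint32_t lwe = tid / lwe_size;
    uint32_t coef = tid % lwe_size;
    dst[(size_t)lwe * lwe_size + coef] =
        src[(size_t)indexes[lwe] * lwe_size + coef];
  }
}

// With trivial indexes the gather degenerates into a single contiguous
// device-to-device copy; otherwise an index-aware kernel is launched.
void gather_lwe(uint64_t *dst, const uint64_t *src, const uint64_t *d_indexes,
                bool indexes_are_trivial, uint32_t lwe_size, uint32_t num_lwe,
                cudaStream_t stream) {
  if (indexes_are_trivial) {
    cudaMemcpyAsync(dst, src, (size_t)num_lwe * lwe_size * sizeof(uint64_t),
                    cudaMemcpyDeviceToDevice, stream);
  } else {
    uint32_t total = num_lwe * lwe_size;
    uint32_t threads = 256;
    uint32_t blocks = (total + threads - 1) / threads;
    gather_lwe_kernel<<<blocks, threads, 0, stream>>>(dst, src, d_indexes,
                                                      lwe_size, num_lwe);
  }
}
```
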
34 changes: 16 additions & 18 deletions backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -366,19 +366,13 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
mem_ptr->params.pbs_type, true);
} else {
auto h_lwe_indexes_in = luts_message_carry->h_lwe_indexes_in;
auto h_lwe_indexes_out = luts_message_carry->h_lwe_indexes_out;
cuda_memcpy_async_to_cpu(h_lwe_indexes_in, lwe_indexes_in,
total_count * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(h_lwe_indexes_out, lwe_indexes_out,
total_count * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
new_blocks_vec, new_blocks, h_lwe_indexes_in,
message_count, big_lwe_size, false);
multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, new_blocks_vec, new_blocks,
luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, message_count,
big_lwe_size, false);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
@@ -394,13 +388,15 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
/// different configuration
multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count,
small_lwe_vector, small_lwe_vector_vec,
h_lwe_indexes_in, message_count,
small_lwe_size);
luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes,
message_count, small_lwe_size);

multi_gpu_lwe_scatter<Torus>(streams, gpu_indexes, gpu_count,
small_lwe_vector_vec, small_lwe_vector,
h_lwe_indexes_in, total_count,
small_lwe_size, false);
multi_gpu_lwe_scatter<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
small_lwe_vector, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, total_count,
small_lwe_size, false);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
@@ -415,7 +411,9 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
mem_ptr->params.pbs_type, false);

multi_gpu_lwe_gather<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
lwe_after_pbs_vec, h_lwe_indexes_out,
lwe_after_pbs_vec,
luts_message_carry->h_lwe_indexes_out,
luts_message_carry->using_trivial_lwe_indexes,
total_count, big_lwe_size);
}

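
The previous sum-of-ciphertexts path copied `lwe_indexes_in` / `lwe_indexes_out` from the device into the LUT's host buffers before scattering; because `set_lwe_indexes` now keeps those host mirrors in sync, the device-to-host copies are no longer needed. A small sketch contrasting the two patterns (hypothetical names, not the repository's API):

```cuda
// Hypothetical sketch contrasting the two patterns; not the repository's API.
#include <cstdint>
#include <cuda_runtime.h>
#include <vector>

// Old pattern: read the indexes back from the device each time they are
// needed on the host, forcing a synchronization point.
void read_indexes_from_device(uint64_t *h_scratch, const uint64_t *d_indexes,
                              uint32_t count, cudaStream_t stream) {
  cudaMemcpyAsync(h_scratch, d_indexes, count * sizeof(uint64_t),
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream); // must wait before touching h_scratch
}

// New pattern: the host mirror is updated whenever the device copy is
// updated, so it can be read immediately with no copy and no extra sync.
struct LutIndexes {
  std::vector<uint64_t> h_lwe_indexes_in;
  bool using_trivial_lwe_indexes = true;
};

const uint64_t *read_indexes_from_mirror(const LutIndexes &lut) {
  return lut.h_lwe_indexes_in.data();
}
```
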
